diff --git a/.clang-tidy b/.clang-tidy index 1681ed66e..f9b77bce8 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -4,7 +4,9 @@ ExtraArgs: [] FormatStyle: file UseColor: true WarningsAsErrors: '*' +# FIXME: Use `ExcludeHeaderFilterRegex` instead when all maintainers upgraded their `clang-tidy` HeaderFilterRegex: '^(?!.*(?:/|^)(3rdparty|tvm)/).*' +# ExcludeHeaderFilterRegex: '^(3rdparty|tvm)/.*$' # NOTE: there must be no spaces before the '-', so put the comma last. Checks: >- diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 3ba13e0ce..0086358db 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1 +1 @@ -blank_issues_enabled: false +blank_issues_enabled: true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a475cd513..e939127cb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,23 +40,13 @@ jobs: timeout-minutes: 30 steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 0 submodules: recursive - - name: Setup Python 3.8 - id: setup-pylowest - uses: actions/setup-python@v6 - with: - python-version: "3.8" # use lowest supported version for linting - update-environment: false - - - name: Check AST with Python 3.8 - run: | - "${{ steps.setup-pylowest.outputs.python-path }}" -m compileall -q -f tilelang - - name: Setup Python 3.9 + id: setup-pylowest uses: actions/setup-python@v6 with: python-version: "3.9" @@ -67,6 +57,10 @@ jobs: requirements*.txt .pre-commit-config.yaml + - name: Check AST with Python 3.9 + run: | + "${{ steps.setup-pylowest.outputs.python-path }}" -m compileall -q -f tilelang + - name: Pre-commit Lint run: | if ! pipx run pre-commit run --all-files --color=always --show-diff-on-failure; then @@ -93,7 +87,7 @@ jobs: name: self-hosted-amd # Format: [Nightly-]ROCm-.[.]. E.g., "ROCm-6.4" or "Nightly-ROCm-7.0". # Use "Nightly-" prefix to use torch nightly builds. - toolkit: ROCm-6.3 + toolkit: Nightly-ROCm-7.1 - tags: [macos-latest] name: macos-latest toolkit: Metal # or Nightly-Metal @@ -104,7 +98,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 0 submodules: recursive @@ -288,35 +282,59 @@ jobs: echo "Clearing uv cache at ${UV_CACHE_DIR} due to failure." uv cache clean + - name: Enable core dump generation (Linux / GitHub-hosted runners) + if: ${{ runner.os == 'Linux' && !startsWith(matrix.runner.name, 'self-hosted') }} + run: | + sudo sysctl -w kernel.core_pattern="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P" + sudo sysctl -w kernel.core_uses_pid=0 + sudo sysctl -w fs.suid_dumpable=1 + sysctl kernel.core_pattern kernel.core_uses_pid fs.suid_dumpable + + - name: Enable core dump generation (macOS / GitHub-hosted runners) + if: ${{ runner.os == 'macOS' && !startsWith(matrix.runner.name, 'self-hosted') }} + run: | + sudo sysctl -w kern.corefile="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P" + sudo sysctl -w kern.coredump=1 + sudo sysctl -w kern.sugid_coredump=1 + sysctl kern.corefile kern.coredump kern.sugid_coredump + + - name: Install project (wheel form) + run: | + uv pip install -v . 
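A hedged aside on the relocated core-dump steps above: `kernel.core_pattern` / `kern.corefile` only control where and how the kernel names a core file; the crashing process additionally needs a nonzero `RLIMIT_CORE` (the `ulimit -c` soft limit), which the sysctl calls do not touch. The sketch below is an assumption about how a test harness could raise that per-process limit; it is not part of this workflow.

```python
# Minimal sketch (assumption, not taken from the workflow): raise the per-process
# core-file size limit so a crashing test can actually emit a core file matching
# the core_pattern configured by the sysctl steps above.
import resource

soft, hard = resource.getrlimit(resource.RLIMIT_CORE)
# Lift the soft limit as far as the hard limit allows (often RLIM_INFINITY).
resource.setrlimit(resource.RLIMIT_CORE, (hard, hard))
print("RLIMIT_CORE:", resource.getrlimit(resource.RLIMIT_CORE))
```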
+ - name: Run clang-tidy id: clang-tidy if: runner.os == 'Linux' run: | echo "\$ $(command -v clang-tidy) --version" && clang-tidy --version - if [[ -x "$(command -v run-clang-tidy)" ]]; then - echo "Using run-clang-tidy from $(command -v run-clang-tidy)" - CLANG_TIDY=(run-clang-tidy) - else - RCT_URL=https://raw.githubusercontent.com/llvm/llvm-project/refs/heads/release/21.x/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py - echo "Downloading run-clang-tidy script from ${RCT_URL}" - echo "import urllib.request; url = '${RCT_URL}'.rstrip('/'); urllib.request.urlretrieve(url, url.split('/')[-1])" | uv run --no-project --script - - CLANG_TIDY=(uv run --no-project --script -- run-clang-tidy.py) - fi + # Download run-clang-tidy script + RCT_URL=https://raw.githubusercontent.com/llvm/llvm-project/refs/heads/release/21.x/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py + echo "Downloading run-clang-tidy script from ${RCT_URL}" + echo "import urllib.request; url = '${RCT_URL}'.rstrip('/'); urllib.request.urlretrieve(url, url.split('/')[-1])" | uv run --no-project --script - + RUN_CLANG_TIDY=(uv run --no-project --script -- run-clang-tidy.py) + if [[ -x "$(command -v clang-apply-replacements)" ]]; then echo "Using clang-apply-replacements from $(command -v clang-apply-replacements)" - CLANG_TIDY+=(-fix -clang-apply-replacements-binary="$(command -v clang-apply-replacements)") + RUN_CLANG_TIDY+=(-fix -clang-apply-replacements-binary="$(command -v clang-apply-replacements)") else echo "::warning::clang-apply-replacements not found in PATH, automatic fixing disabled." fi # Run cmake to create the build directory with compile_commands.json cmake -S . -B cmake-build --fresh ${CLANG_TIDY_CMAKE_OPTIONS} # no quotes here + echo "::group::compile_commands.json" + ls -alh cmake-build/compile_commands.json + uv run --no-project -m -- json.tool --no-ensure-ascii cmake-build/compile_commands.json + echo "::endgroup::" CXX_FILES=$(find src -type f -iname "*.[ch]pp" -o -iname "*.cc" -o -iname "*.c" -o -iname "*.h") rc=0 - "${CLANG_TIDY[@]}" -clang-tidy-binary="$(command -v clang-tidy)" \ + echo "::group::run-clang-tidy" + "${RUN_CLANG_TIDY[@]}" -clang-tidy-binary="$(command -v clang-tidy)" \ + -exclude-header-filter='^(3rdparty|tvm)/.*$' \ -p="cmake-build" ${CXX_FILES} || rc="$?" + echo "::endgroup::" rm -rf cmake-build run-clang-tidy.py if (( rc != 0 )); then echo "::error::clang-tidy found issues (exit code: ${rc}). Please run 'clang-tidy --fix' locally to fix them." @@ -324,26 +342,6 @@ jobs: exit "${rc}" fi - - name: Enable core dump generation (Linux / GitHub-hosted runners) - if: ${{ runner.os == 'Linux' && !startsWith(matrix.runner.name, 'self-hosted') }} - run: | - sudo sysctl -w kernel.core_pattern="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P" - sudo sysctl -w kernel.core_uses_pid=0 - sudo sysctl -w fs.suid_dumpable=1 - sysctl kernel.core_pattern kernel.core_uses_pid fs.suid_dumpable - - - name: Enable core dump generation (macOS / GitHub-hosted runners) - if: ${{ runner.os == 'macOS' && !startsWith(matrix.runner.name, 'self-hosted') }} - run: | - sudo sysctl -w kern.corefile="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P" - sudo sysctl -w kern.coredump=1 - sudo sysctl -w kern.sugid_coredump=1 - sysctl kern.corefile kern.coredump kern.sugid_coredump - - - name: Install project (wheel form) - run: | - uv pip install -v . 
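The rewritten `Run clang-tidy` step above fetches `run-clang-tidy.py` with a Python one-liner piped into `uv run --no-project --script -`. Expanded into a standalone sketch, the same download logic (same URL, same filename derivation from the URL) looks like this:

```python
# Sketch of the download performed inline in the clang-tidy workflow step above.
# The URL matches RCT_URL in the step; urlretrieve saves the script under its
# basename ("run-clang-tidy.py") in the current working directory.
import urllib.request

url = (
    "https://raw.githubusercontent.com/llvm/llvm-project/refs/heads/release/21.x/"
    "clang-tools-extra/clang-tidy/tool/run-clang-tidy.py"
)
urllib.request.urlretrieve(url.rstrip("/"), url.split("/")[-1])
```

The step then runs the downloaded script via `uv run --no-project --script -- run-clang-tidy.py`, passing `-exclude-header-filter='^(3rdparty|tvm)/.*$'` on the command line, which mirrors the commented-out `ExcludeHeaderFilterRegex` in `.clang-tidy` for clang-tidy versions that do not yet support that key.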
- - name: Run examples with Python ${{ matrix.python-version }} (${{ matrix.runner.toolkit }}) if: contains(matrix.runner.toolkit, 'CUDA') run: | @@ -369,6 +367,7 @@ jobs: ./python # AMD ROCm tests + # runtime and transform tests needs to repair, then rm it from ignore list - name: Run ROCm tests with Python ${{ matrix.python-version }} (${{ matrix.runner.toolkit }}) id: rocm-tests if: contains(matrix.runner.toolkit, 'ROCm') @@ -379,7 +378,8 @@ jobs: pytest --verbose --color=yes --durations=0 --showlocals --cache-clear ) "${PYTEST[@]}" --maxfail=3 --numprocesses=4 \ - ./python/amd/test_tilelang_test_amd.py + --ignore=./python/runtime --ignore=./python/transform \ + ./python # Apple Metal tests - name: Run Metal tests with Python ${{ matrix.python-version }} (${{ matrix.runner.toolkit }}) diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml index 0ba3fbc30..dcfdcff14 100644 --- a/.github/workflows/dist.yml +++ b/.github/workflows/dist.yml @@ -1,5 +1,6 @@ name: Dist on: + workflow_dispatch: schedule: # gemini said this is 6:00 china time - cron: "0 22 * * *" @@ -17,6 +18,9 @@ on: - CMakeLists.txt - version_provider.py - .github/workflows/dist.yml + # temporarily add to dist check + # until we have type checking in ci / move to python 3.10 + - tilelang/_typing.py release: types: - published @@ -34,6 +38,11 @@ env: COLUMNS: "100" FORCE_COLOR: "1" CLICOLOR_FORCE: "1" + UV_INDEX_STRATEGY: "unsafe-best-match" + UV_HTTP_TIMEOUT: "600" + XDG_CACHE_HOME: "${{ github.workspace }}/.cache" # to be updated + PIP_CACHE_DIR: "${{ github.workspace }}/.cache/pip" # to be updated + UV_CACHE_DIR: "${{ github.workspace }}/.cache/uv" # to be updated jobs: build-sdist: @@ -52,7 +61,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 1 submodules: recursive @@ -71,6 +80,7 @@ jobs: - name: Setup ccache uses: hendrikmuhs/ccache-action@v1 with: + max-size: "200MB" create-symlink: true evict-old-files: "7d" append-timestamp: false @@ -91,7 +101,7 @@ jobs: - name: Upload SDist # Not PR to save artifact storage, as SDist is only needed for releases. if: github.event_name != 'pull_request' || contains(github.event.pull_request.title, '[Release]') - uses: actions/upload-artifact@v5 + uses: actions/upload-artifact@v6 with: name: sdist path: dist/*.tar.gz @@ -105,24 +115,25 @@ jobs: strategy: matrix: target: - - { runner: ubuntu-latest, toolkit: "CUDA-12.1" } + - { runner: ubuntu-latest, toolkit: "CUDA-12.8" } - { runner: ubuntu-24.04-arm, toolkit: "CUDA-12.8" } + - { runner: ubuntu-latest, toolkit: "Nightly-CUDA-13.0" } + - { runner: ubuntu-24.04-arm, toolkit: "Nightly-CUDA-13.0" } - { runner: macos-latest, toolkit: "Metal" } python-version: # Wheels are built with Python 3.8 Limited API, they should work with all Python >= 3.8. # Only build wheels against Python 3.8 Limited API to save CI resources. - # FIXME: Here we use Python 3.9 because our dependency `apache-tvm-ffi` claims to support - # Python 3.8 but it depends on a version of `ml-dtypes` that requires Python >= 3.9. 
- "3.9" fail-fast: false timeout-minutes: 120 runs-on: ${{ matrix.target.runner }} env: - NO_VERSION_LABEL: ${{ github.event_name == 'release' && 'OFF' || 'ON' }} + IS_RELEASE: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.title, '[Release]') }} + NO_VERSION_LABEL: "OFF" steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 1 submodules: recursive @@ -130,16 +141,14 @@ jobs: - name: Setup ccache uses: hendrikmuhs/ccache-action@v1 with: + max-size: "200MB" create-symlink: true evict-old-files: "7d" append-timestamp: false - key: wheel-${{ runner.os }}-${{ runner.arch }}-${{ matrix.target.toolkit }}-${{ hashFiles('**/*.cc') }} + key: wheel-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/*.cc') }} restore-keys: | - wheel-${{ runner.os }}-${{ runner.arch }}-${{ matrix.target.toolkit }}-${{ hashFiles('**/*.cc') }} - wheel-${{ runner.os }}-${{ runner.arch }}-${{ matrix.target.toolkit }} + wheel-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/*.cc') }} wheel-${{ runner.os }}-${{ runner.arch }} - ${{ runner.os }}-${{ runner.arch }}-${{ matrix.target.toolkit }} - ${{ runner.os }}-${{ runner.arch }} - name: Set CIBW_BUILD run: | @@ -150,26 +159,77 @@ jobs: if [[ "${{ matrix.target.toolkit }}" == *"CUDA"* ]]; then CUDA_VERSION="${{ matrix.target.toolkit }}" - CUDA_VERSION="${CUDA_VERSION#CUDA-}" + CUDA_VERSION="${CUDA_VERSION##*-}" + CUDA_VERSION_MAJMIN="$(echo ${CUDA_VERSION} | cut -d '.' -f-2)" + CUDA_VERSION_MAJMIN_NODOT="${CUDA_VERSION_MAJMIN//./}" echo "CUDA_VERSION=${CUDA_VERSION}" | tee -a "${GITHUB_ENV}" + if [[ "${{ matrix.target.toolkit }}" == "Nightly-"* ]]; then + # Use torch nightly builds + export UV_INDEX="https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION_MAJMIN_NODOT}" + else + export UV_INDEX="https://download.pytorch.org/whl/cu${CUDA_VERSION_MAJMIN_NODOT}" + echo "UV_TORCH_BACKEND=cu${CUDA_VERSION_MAJMIN_NODOT}" | tee -a "${GITHUB_ENV}" + fi + echo "UV_INDEX=${UV_INDEX}" | tee -a "${GITHUB_ENV}" + fi + + if [[ "${{ env.IS_RELEASE }}" == "true" ]]; then + if [[ "${{ matrix.target.toolkit }}" == "Nightly-"* ]]; then + # Avoid using same file name for different toolkit. + echo "NO_GIT_VERSION=ON" | tee -a "${GITHUB_ENV}" + else + echo "NO_VERSION_LABEL=ON" | tee -a "${GITHUB_ENV}" + fi fi if [[ "${{ runner.os }}" == "Linux" ]]; then HOST_CCACHE_DIR="$(ccache --get-config cache_dir)" - echo "CIBW_BEFORE_BUILD_LINUX=yum install -y ccache && ccache -o cache_dir=/host${HOST_CCACHE_DIR}" | tee -a "${GITHUB_ENV}" + echo "CIBW_BEFORE_BUILD_LINUX=dnf install -y ccache && ccache -o cache_dir=/host${HOST_CCACHE_DIR}" | tee -a "${GITHUB_ENV}" fi - name: Build wheels - uses: pypa/cibuildwheel@v3.2 + uses: pypa/cibuildwheel@v3.3 with: package-dir: . 
output-dir: wheelhouse config-file: "{package}/pyproject.toml" + - name: Setup Python and uv with caching + id: setup-uv + uses: astral-sh/setup-uv@v7 + with: + python-version: "3.12" + activate-environment: true + + - name: Test built wheels + run: | + for WHEEL in wheelhouse/*.whl; do + echo "Testing wheel: ${WHEEL}" + ( + set -e + uv venv --python=3.12 test-venv + source test-venv/bin/activate + + uv pip install --upgrade pip setuptools wheel + if [[ "${UV_INDEX}" == *"/nightly/"* ]]; then + uv pip install --prerelease=allow -v torch + fi + + uv pip install -v "${WHEEL}" + ( + set -e + cd / + uv run --no-project -- python -c "import tilelang; print(tilelang.__version__)" + ) + deactivate + rm -rf test-venv + ) + done + - name: Upload wheels # Not PR to save artifact storage, as wheels are only needed for releases. if: github.event_name != 'pull_request' || contains(github.event.pull_request.title, '[Release]') - uses: actions/upload-artifact@v5 + uses: actions/upload-artifact@v6 with: name: wheels-${{ matrix.python-version }}-${{ runner.os }}-${{ runner.arch }}-${{ matrix.target.toolkit }} path: wheelhouse/*.whl @@ -184,7 +244,7 @@ jobs: timeout-minutes: 15 steps: - name: Download built SDist - uses: actions/download-artifact@v6 + uses: actions/download-artifact@v7 with: # unpacks default artifact into dist/ # if `name: artifact` is omitted, the action will create extra parent dir @@ -192,7 +252,7 @@ jobs: path: dist - name: Download built wheels - uses: actions/download-artifact@v6 + uses: actions/download-artifact@v7 with: pattern: wheels-* path: dist @@ -202,7 +262,7 @@ jobs: run: ls -lh dist/* - name: Upload artifacts - uses: actions/upload-artifact@v5 + uses: actions/upload-artifact@v6 with: name: artifacts path: dist/* diff --git a/.github/workflows/pr-perfbench-bot.yml b/.github/workflows/pr-perfbench-bot.yml deleted file mode 100644 index 37da4e3c8..000000000 --- a/.github/workflows/pr-perfbench-bot.yml +++ /dev/null @@ -1,88 +0,0 @@ -name: Performance Benchmark Bot - -on: - issue_comment: - types: - - created - -permissions: - contents: read - -concurrency: - group: "${{ github.workflow }}-${{ github.ref }}" - cancel-in-progress: true # always cancel in-progress - -env: - PYTHONDEVMODE: "1" - PYTHONUNBUFFERED: "1" - PYTHONPATH: "" # explicit cleanup - PIP_USER: "" # explicit cleanup - COLUMNS: "100" - FORCE_COLOR: "1" - CLICOLOR_FORCE: "1" - XDG_CACHE_HOME: "${{ github.workspace }}/.cache" # to be updated - PIP_CACHE_DIR: "${{ github.workspace }}/.cache/pip" # to be updated - -jobs: - perfbench: - name: Benchmark between PR and main - if: | - github.repository_owner == 'tile-ai' && - github.event.issue.pull_request && - (contains(github.event.comment.body, '/performance-report') || contains(github.event.comment.body, '/perf')) - runs-on: [self-hosted, nvidia] - steps: - - name: Checkout repository - uses: actions/checkout@v5 - with: - ref: refs/pull/${{ github.event.issue.number }}/merge - fetch-depth: 0 - submodules: recursive - - - name: Setup Python - uses: actions/setup-python@v6 - with: - python-version: "3.12" - update-environment: true - cache: pip - cache-dependency-path: | - pyproject.toml - requirements*.txt - - - name: Install merged version - run: | - python -m venv tll - source tll/bin/activate - pip install -r requirements-test.txt - pip install . - - - name: Install original version - run: | - echo "Check files to be deleted!" - git clean -dxf -e tll/ - echo "Delete files completed!" 
- git checkout main - python -m venv tl - source tl/bin/activate - pip install -r requirements-test.txt - pip install . - - - name: Run performance test - id: perfbench - run: | - source tl/bin/activate - python maint/scripts/ci_performance.py - - - name: Post test results as PR comment - uses: actions/github-script@v8 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - script: | - github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: '📊 ​**Performance Test Results** (triggered by @' + context.payload.comment.user.login + '):\n\n' + - 'Run listed here: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\n\n' + - "${{ steps.perfbench.outputs.stdout }}" - }) diff --git a/.github/workflows/pr-regression-test-bot.yml b/.github/workflows/pr-regression-test-bot.yml new file mode 100644 index 000000000..568ce8555 --- /dev/null +++ b/.github/workflows/pr-regression-test-bot.yml @@ -0,0 +1,273 @@ +name: Performance Regression Bot + +on: + issue_comment: + types: + - created + +permissions: + contents: read + issues: write + pull-requests: write + +concurrency: + # Use the issue/PR number to differentiate between different PRs + group: "${{ github.workflow }}-${{ github.event.issue.number }}" + cancel-in-progress: true + +env: + PYTHONDEVMODE: "1" + PYTHONUNBUFFERED: "1" + PYTHONPATH: "" # explicit cleanup + PIP_USER: "" # explicit cleanup + COLUMNS: "100" + FORCE_COLOR: "1" + CLICOLOR_FORCE: "1" + UV_INDEX_STRATEGY: "unsafe-best-match" + UV_HTTP_TIMEOUT: "600" + XDG_CACHE_HOME: "${{ github.workspace }}/.cache" # to be updated + PIP_CACHE_DIR: "${{ github.workspace }}/.cache/pip" # to be updated + UV_CACHE_DIR: "${{ github.workspace }}/.cache/uv" # to be updated + PRE_COMMIT_HOME: "${{ github.workspace }}/.cache/pip/.pre-commit" # to be updated + +jobs: + permissions-check: + name: Check bot permissions + if: | + github.repository_owner == 'tile-ai' && + github.event.issue.pull_request && + (contains(github.event.comment.body, '@regression-perf')) + runs-on: ubuntu-latest + steps: + - name: Get commenter permission + id: perm + uses: actions/github-script@v8 + with: + script: | + const username = context.payload.comment.user.login + const { owner, repo } = context.repo + const { data } = await github.rest.repos.getCollaboratorPermissionLevel({ owner, repo, username }) + core.setOutput('permission', data.permission) // admin|maintain|write|triage|read|none + + - name: Reject if not allowed + if: ${{ steps.perm.outputs.permission != 'admin' && steps.perm.outputs.permission != 'maintain' && steps.perm.outputs.permission != 'write' }} + run: | + echo "Not authorized: permission=${{ steps.perm.outputs.permission }}" + exit 1 + + pr-regression: + name: Performance regression test between PR and main + needs: [permissions-check] + runs-on: ${{ matrix.runner.tags }} + strategy: + matrix: + runner: + - tags: [self-hosted, nvidia] + name: self-hosted-nvidia + toolkit: CUDA-12.8 + python-version: + - "3.12" + fail-fast: false + timeout-minutes: 120 + steps: + - name: Checkout repository + uses: actions/checkout@v6 + with: + ref: refs/pull/${{ github.event.issue.number }}/merge + fetch-depth: 0 + submodules: recursive + + - name: Set environment (self-hosted runners) + if: startsWith(matrix.runner.name, 'self-hosted') + run: | + # Hide sensitive data in logs for self-hosted runners + if [[ -n "${{ secrets.SECRET_PATH_PREFIXES }}" ]]; then + echo "::add-mask::${{ secrets.SECRET_PATH_PREFIXES }}" + 
# Colon separated list of secrets to mask + for secret in $(echo "${{ secrets.SECRET_PATH_PREFIXES }}" | tr ':' '\n'); do + echo "::add-mask::${secret}" + done + fi + + # Use runner tool_cache as cache root for self-hosted runners to avoid internet connection + # issues and to share cache between jobs. + export XDG_CACHE_HOME="${{ runner.tool_cache }}/.ci-cache-${{ github.workflow }}" + echo "XDG_CACHE_HOME=${XDG_CACHE_HOME}" | tee -a "${GITHUB_ENV}" + echo "PIP_CACHE_DIR=${XDG_CACHE_HOME}/pip" | tee -a "${GITHUB_ENV}" + echo "UV_CACHE_DIR=${XDG_CACHE_HOME}/uv" | tee -a "${GITHUB_ENV}" + echo "PRE_COMMIT_HOME=${XDG_CACHE_HOME}/pip/.pre-commit" | tee -a "${GITHUB_ENV}" + + # Do not use ccache on self-hosted runners, as it will download/upload caches which is slow. + # Self-hosted runners usually have more CPU power to compile without ccache. + - name: Setup ccache (GitHub-hosted runners) + id: setup-ccache + if: ${{ !startsWith(matrix.runner.name, 'self-hosted') }} + uses: hendrikmuhs/ccache-action@v1 + with: + create-symlink: true + evict-old-files: "7d" + append-timestamp: false + key: ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }}-${{ hashFiles('**/*.cc') }} + restore-keys: | + ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }}-${{ hashFiles('**/*.cc') }} + ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }} + ${{ runner.os }}-${{ runner.arch }} + + - name: Set environment (CUDA) + if: contains(matrix.runner.toolkit, 'CUDA') + run: | + TOOLKIT="${{ matrix.runner.toolkit }}" + CUDA_VERSION="${TOOLKIT##*-}" + CUDA_VERSION_MAJMIN="$(echo ${CUDA_VERSION} | cut -d '.' -f-2)" + CUDA_VERSION_MAJMIN_NODOT="${CUDA_VERSION_MAJMIN//./}" + if [[ "${TOOLKIT}" == "Nightly-"* ]]; then + # Use torch nightly builds + export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION_MAJMIN_NODOT}" + else + export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu${CUDA_VERSION_MAJMIN_NODOT}" + fi + export UV_INDEX="${PIP_EXTRA_INDEX_URL}" + export CLANG_TIDY_CMAKE_OPTIONS="${CLANG_TIDY_CMAKE_OPTIONS} -DUSE_CUDA=ON" + + echo "USE_CUDA=ON" | tee -a "${GITHUB_ENV}" + echo "CUDA_VERSION=${CUDA_VERSION}" | tee -a "${GITHUB_ENV}" + echo "CUDA_VERSION_MAJMIN=${CUDA_VERSION_MAJMIN}" | tee -a "${GITHUB_ENV}" + echo "CUDA_VERSION_MAJMIN_NODOT=${CUDA_VERSION_MAJMIN_NODOT}" | tee -a "${GITHUB_ENV}" + echo "PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}" | tee -a "${GITHUB_ENV}" + echo "UV_INDEX=${UV_INDEX}" | tee -a "${GITHUB_ENV}" + echo "CLANG_TIDY_CMAKE_OPTIONS=${CLANG_TIDY_CMAKE_OPTIONS}" | tee -a "${GITHUB_ENV}" + + if [[ ! -x "$(command -v nvcc)" ]]; then + export PATH="/usr/local/cuda/bin:${PATH}" + export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" + echo "PATH=${PATH}" | tee -a "${GITHUB_ENV}" + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" | tee -a "${GITHUB_ENV}" + fi + if [[ -x "$(command -v nvcc)" ]]; then + echo "\$ $(command -v nvcc) --version" && nvcc --version + else + echo "::warning::nvcc not found in PATH!" + fi + + - name: Setup Python and uv with caching + id: setup-uv + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.python-version }} + activate-environment: true + # Do not use cache for self-hosted runners, as it will download/upload caches which is slow. 
+ enable-cache: ${{ !startsWith(matrix.runner.name, 'self-hosted') }} + prune-cache: ${{ !startsWith(matrix.runner.name, 'self-hosted') }} + # Use runner tool_cache for self-hosted runners + cache-local-path: ${{ env.UV_CACHE_DIR }} + ignore-nothing-to-cache: true + # Extra cache key to upload/download caches on GitHub-hosted runners + cache-suffix: uv-${{ runner.os }}-${{ runner.arch }}-${{ matrix.python-version }}-${{ matrix.runner.name }}-${{ matrix.runner.toolkit }} + cache-dependency-glob: | + pyproject.toml + requirements*.txt + + - name: Setup environments + id: setup-venv + run: | + set -e + + uv venv --python "${{ matrix.python-version }}" new + + source new/bin/activate + uv pip install -v -r requirements-test.txt + uv pip install -v . + + - name: Install Main version (Baseline) + run: | + set -e + git clean -dxf -e new/ -e .cache/ + git checkout main + git submodule update --init --recursive + uv venv --python "${{ matrix.python-version }}" old + source old/bin/activate + + uv pip install -v -r requirements-test.txt + uv pip install -v . + rm -rf tilelang build + + uv venv --python "${{ matrix.python-version }}" test_regression + source test_regression/bin/activate + uv pip install -v -r requirements-test.txt + + - name: Clear uv cache for self-hosted runners (if setup failed) + if: >- + ${{ + failure() && + startsWith(matrix.runner.name, 'self-hosted') && + (steps.setup-uv.conclusion == 'failure' || steps.setup-venv.conclusion == 'failure') + }} + run: | + echo "Clearing uv cache at ${UV_CACHE_DIR} due to failure." + uv cache clean + + - name: Enable core dump generation (Linux / GitHub-hosted runners) + if: ${{ runner.os == 'Linux' && !startsWith(matrix.runner.name, 'self-hosted') }} + run: | + sudo sysctl -w kernel.core_pattern="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P" + sudo sysctl -w kernel.core_uses_pid=0 + sudo sysctl -w fs.suid_dumpable=1 + sysctl kernel.core_pattern kernel.core_uses_pid fs.suid_dumpable + + - name: Enable core dump generation (macOS / GitHub-hosted runners) + if: ${{ runner.os == 'macOS' && !startsWith(matrix.runner.name, 'self-hosted') }} + run: | + sudo sysctl -w kern.corefile="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P" + sudo sysctl -w kern.coredump=1 + sudo sysctl -w kern.sugid_coredump=1 + sysctl kern.corefile kern.coredump kern.sugid_coredump + + - name: Run performance regression test + run: | + source test_regression/bin/activate + OLD_PYTHON=./old/bin/python NEW_PYTHON=./new/bin/python \ + PERF_REGRESSION_MD=regression_result.md PERF_REGRESSION_PNG=regression_result.png \ + python ./maint/scripts/test_perf_regression.py + + - name: Read markdown table + id: read_md + run: | + echo "content<> $GITHUB_OUTPUT + cat regression_result.md >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + + - name: Upload result image as artifact + uses: actions/upload-artifact@v6 + with: + name: perf-regression-${{ github.run_id }} + path: regression_result.png + + - name: Post test results as PR comment + uses: actions/github-script@v8 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const fs = require('fs'); + // Read the file directly instead of passing via env/outputs to avoid escaping issues + const md = fs.readFileSync('regression_result.md', 'utf8'); + + const runUrl = `${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`; + + const body = + 'Performance Regression Test Report\n' + + '============================\n\n' + + `Triggered by: 
@${context.payload.comment.user.login}\n` + + `Workflow run: ${runUrl}\n\n` + + 'Results\n' + + '-------\n\n' + + md + '\n\n' + + 'Artifacts\n' + + '---------\n\n' + + '- regression_result.png (speedup plot) is attached as a workflow artifact. Download it from the workflow run page above.\n'; + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body + }); diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index 953303102..2197015b6 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -25,7 +25,7 @@ jobs: runs-on: [self-hosted, nvidia] steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 0 submodules: recursive diff --git a/.gitignore b/.gitignore index 752f6cb76..727b6a14e 100644 --- a/.gitignore +++ b/.gitignore @@ -108,3 +108,18 @@ cmake-build-*/ # pre-commit cache .pre-commit-cache/* + +# host checks logs +maint/host_checks/logs/* + +# ncu +*.ncu-rep + +# csv +*.csv + +# clang-tidy +/run-clang-tidy.py + +# perf regression test +.perf_regression/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 615f173b9..a99de631d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,19 +9,17 @@ default_stages: [pre-commit, pre-push, manual] exclude: '^(build|3rdparty)/.*$' # exclude build and 3rdparty directories repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v6.0.0 + rev: v6.0.0 # May not sync with requirements-lint.txt, but it's OK for now hooks: - id: check-symlinks - id: destroyed-symlinks - # FIXME: enable these hooks - # - id: trailing-whitespace - # - id: end-of-file-fixer + - id: trailing-whitespace + - id: end-of-file-fixer - id: check-added-large-files - id: check-merge-conflict fail_fast: true - # FIXME: enable these hooks - # - id: check-executables-have-shebangs - # - id: check-shebang-scripts-are-executable + - id: check-executables-have-shebangs + - id: check-shebang-scripts-are-executable - id: detect-private-key - id: check-yaml - id: check-toml @@ -32,30 +30,17 @@ repos: args: [--ignore-case] files: ^docs/spelling_wordlist\.txt$ - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v21.1.2 # sync with requirements-lint.txt + rev: v21.1.8 # sync with requirements-lint.txt hooks: - id: clang-format - exclude: | - (?ix)( - ^.+\.(cu|cuh)$| - ^.+\.json$ - ) + types_or: [c++, c] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.3 # sync with requirements-lint.txt + rev: v0.14.14 # sync with requirements-lint.txt hooks: - id: ruff-check args: [--fix, --exit-non-zero-on-fix] - - repo: https://github.com/google/yapf - rev: v0.43.0 # sync with requirements-lint.txt - hooks: - - id: yapf - name: yapf-multiproc-bugfix - # yapf is not multiprocess safe, so we run a dummy yapf first. 
- args: [--in-place, docs/conf.py] - always_run: true - pass_filenames: false - - id: yapf - args: [--recursive, --in-place] + - id: ruff-format + args: [--exit-non-zero-on-format] - repo: https://github.com/codespell-project/codespell rev: v2.4.1 # sync with requirements-lint.txt hooks: @@ -67,3 +52,8 @@ repos: ^.+\.svg$| ^.*\brequirements\b.*\.txt$ ) + - repo: https://github.com/jackdewinter/pymarkdown + rev: v0.9.35 + hooks: + - id: pymarkdown + args: ["--config", ".pymarkdown", "fix"] diff --git a/.pymarkdown b/.pymarkdown new file mode 100644 index 000000000..5394265ed --- /dev/null +++ b/.pymarkdown @@ -0,0 +1,37 @@ +{ + "plugins": { + "md003": { + "style": "atx" + }, + "md004": { + "style": "dash" + }, + "md013": { + "enabled": false + }, + "md026": { + "enabled": false + }, + "md029": { + "enabled": false + }, + "md031": { + "enabled": false + }, + "md032": { + "enabled": false + }, + "md033": { + "enabled": false + }, + "md034": { + "enabled": false + }, + "md040": { + "enabled": false + }, + "md041": { + "enabled": false + } + } +} diff --git a/3rdparty/composable_kernel b/3rdparty/composable_kernel index 1c45ca35d..b38bb492a 160000 --- a/3rdparty/composable_kernel +++ b/3rdparty/composable_kernel @@ -1 +1 @@ -Subproject commit 1c45ca35dd5c215e0c1db1f40f01556f467f52a8 +Subproject commit b38bb492a1a55b5abb0c345962143c0f9c482cfb diff --git a/3rdparty/tvm b/3rdparty/tvm index 1815c3e0b..8d494caca 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit 1815c3e0b6ec4ead36370bbd1562025d8529017c +Subproject commit 8d494cacae52b2ec73f2717431190b1ecd5df6ce diff --git a/CMakeLists.txt b/CMakeLists.txt index 72e1d9795..4e520dbcb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -136,14 +136,21 @@ file(GLOB TILE_LANG_SRCS src/*.cc src/layout/*.cc src/transform/*.cc + src/transform/common/*.cc src/op/*.cc src/target/utils.cc + src/target/codegen_c_host.cc src/target/codegen_cpp.cc src/target/rt_mod_cpp.cc # intrin_rule doesn't have system dependency src/target/intrin_rule*.cc ) +# Always include CPU-safe runtime helpers +list(APPEND TILE_LANG_SRCS + src/runtime/error_helpers.cc +) + # Track if the user explicitly selected a backend via cache options. set(TILELANG_BACKEND_USER_SELECTED OFF) foreach(BACKEND IN LISTS TILELANG_BACKENDS) @@ -204,17 +211,55 @@ elseif(USE_CUDA) # Set `USE_CUDA=/usr/local/cuda-x.y` cmake_path(GET CUDAToolkit_BIN_DIR PARENT_PATH USE_CUDA) + # ============================================================================ + # CUDA Driver Stub Library (libcuda_stub.so) + # ============================================================================ + # This library provides drop-in replacements for CUDA driver API functions. + # Instead of linking directly against libcuda.so (which would fail on + # CPU-only machines), we link against this stub which loads libcuda.so + # lazily at runtime on first API call. + # + # The stub exports global C functions matching the CUDA driver API: + # - cuModuleLoadData, cuLaunchKernel, cuMemsetD32_v2, etc. + # These can be called directly without any wrapper macros. 
+ # ============================================================================ + add_library(cuda_stub SHARED src/target/stubs/cuda.cc) + target_include_directories(cuda_stub PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) + # Export symbols with visibility="default" when building + target_compile_definitions(cuda_stub PRIVATE TILELANG_CUDA_STUB_EXPORTS) + # Use dlopen/dlsym for runtime library loading + target_link_libraries(cuda_stub PRIVATE ${CMAKE_DL_LIBS}) + set_target_properties(cuda_stub PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + # Use consistent naming + OUTPUT_NAME "cuda_stub" + ) + file(GLOB TILE_LANG_CUDA_SRCS - src/runtime/*.cc + src/runtime/runtime.cc src/target/ptx.cc src/target/codegen_cuda.cc + src/target/codegen_py.cc + src/target/codegen_utils.cc + src/target/codegen_cutedsl.cc src/target/rt_mod_cuda.cc + src/target/rt_mod_cutedsl.cc ) list(APPEND TILE_LANG_SRCS ${TILE_LANG_CUDA_SRCS}) list(APPEND TILE_LANG_INCLUDES ${CUDAToolkit_INCLUDE_DIRS}) endif() +set(USE_Z3 ON CACHE STRING "Use Z3 SMT solver for TileLang optimizations") +set(USE_PYPI_Z3 ON CACHE BOOL "Use Z3 provided by PyPI z3-solver package") + +if(USE_Z3 AND USE_PYPI_Z3) + list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/pypi-z3") + find_package(Z3 REQUIRED) +endif() + # Include tvm after configs have been populated add_subdirectory(${TVM_SOURCE} tvm EXCLUDE_FROM_ALL) @@ -222,7 +267,11 @@ add_subdirectory(${TVM_SOURCE} tvm EXCLUDE_FROM_ALL) add_compile_definitions(DMLC_USE_LOGGING_LIBRARY=) add_library(tilelang_objs OBJECT ${TILE_LANG_SRCS}) + +# Set debug mode compile definitions +# Enable the TVM debug option, i.e., TVM_LOG_DEBUG if(CMAKE_BUILD_TYPE STREQUAL "Debug") + message(STATUS "Building TileLang with DEBUG mode") target_compile_definitions(tilelang_objs PRIVATE "TVM_LOG_DEBUG") endif() @@ -232,6 +281,18 @@ add_library(tilelang SHARED $) add_library(tilelang_module SHARED $) target_link_libraries(tilelang PUBLIC tvm_runtime tvm) target_link_libraries(tilelang_module PUBLIC tvm) + +# Place dev build outputs under build/lib for consistency +set_target_properties(tilelang PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" +) +set_target_properties(tilelang_module PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" +) # Build cython extension find_package(Python REQUIRED COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT}) @@ -251,26 +312,105 @@ if(NOT "${SKBUILD_SABI_VERSION}" STREQUAL "") endif() python_add_library(tilelang_cython_wrapper MODULE "${CMAKE_BINARY_DIR}/tilelang_cython_wrapper.cpp" ${USE_SABI} WITH_SOABI) -# Install extension into the tilelang package directory + +# Ensure dev builds drop the extension into build/lib alongside other shared libs +set_target_properties(tilelang_cython_wrapper PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" +) + +# Install the extension into tilelang/lib inside the wheel install(TARGETS tilelang_cython_wrapper - LIBRARY DESTINATION tilelang - RUNTIME DESTINATION tilelang - ARCHIVE DESTINATION tilelang) + LIBRARY DESTINATION tilelang/lib + RUNTIME DESTINATION 
tilelang/lib + ARCHIVE DESTINATION tilelang/lib) + +# Copy libz3.so to build folder to workaround isolated build env issue +if(USE_Z3 AND USE_PYPI_Z3) + get_target_property(Z3_LIBRARY_PATH z3::libz3 IMPORTED_LOCATION) + install(FILES "${Z3_LIBRARY_PATH}" DESTINATION "${CMAKE_BINARY_DIR}/lib") + if(APPLE) + set_target_properties(tvm PROPERTIES BUILD_RPATH "@loader_path") + else() + set_target_properties(tvm PROPERTIES BUILD_RPATH "\$ORIGIN") + endif() +endif() + +set(TILELANG_OUTPUT_TARGETS + tilelang + tilelang_module + tvm + tvm_runtime +) + +if(USE_CUDA) + # Link against CUDA stub library instead of libcuda.so + # This enables lazy loading of libcuda.so at runtime, allowing + # `import tilelang` to succeed on CPU-only machines. + foreach(target IN LISTS TILELANG_OUTPUT_TARGETS) + target_link_libraries(${target} PUBLIC cuda_stub) + endforeach() + # Include CUDA stub in output targets for RPATH configuration + list(APPEND TILELANG_OUTPUT_TARGETS cuda_stub) +endif() + +unset(PATCHELF_EXECUTABLE CACHE) -# let libtilelang to search tvm/tvm_runtime in same dir if(APPLE) - set_target_properties(tilelang PROPERTIES INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib") - set_target_properties(tilelang_module PROPERTIES INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib") - set_target_properties(tvm PROPERTIES INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib") - set_target_properties(tvm_runtime PROPERTIES INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib") + set(TILELANG_INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib") + if(USE_Z3 AND USE_PYPI_Z3) + # Some z3 is placed in lib/ and some in bin/, we add both in rpath + string(APPEND TILELANG_INSTALL_RPATH ";@loader_path/../../z3/lib;@loader_path/../../z3/bin") + endif() elseif(UNIX) - set_target_properties(tilelang PROPERTIES INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib") - set_target_properties(tilelang_module PROPERTIES INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib") - set_target_properties(tvm PROPERTIES INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib") - set_target_properties(tvm_runtime PROPERTIES INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib") + set(TILELANG_INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib") + if(USE_Z3 AND USE_PYPI_Z3) + string(APPEND TILELANG_INSTALL_RPATH ":\$ORIGIN/../../z3/lib") + endif() + if(USE_CUDA) + string(APPEND TILELANG_INSTALL_RPATH ":\$ORIGIN/../../nvidia/cu${CUDAToolkit_VERSION_MAJOR}/lib") + endif() + find_program(PATCHELF_EXECUTABLE patchelf) + if (NOT PATCHELF_EXECUTABLE) + message(STATUS "`patchelf` not found.") + endif() +endif() + +# Let libtilelang search for tvm/tvm_runtime in the same directory +foreach(target IN LISTS TILELANG_OUTPUT_TARGETS) + set_target_properties(${target} PROPERTIES INSTALL_RPATH "${TILELANG_INSTALL_RPATH}") + set_target_properties(${target} PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ) +endforeach() + +# Exclude libcuda.so to allow importing on a CPU-only machine +if(USE_CUDA AND PATCHELF_EXECUTABLE) + # Run `patchelf` on built libraries to remove libcuda.so dependency. + # Use `install(CODE ...)` instead of `add_custom_command(... POST_BUILD ...)` + # to avoid race conditions during linking. 
+ foreach(target IN LISTS TILELANG_OUTPUT_TARGETS) + install(CODE " + execute_process( + COMMAND ${PATCHELF_EXECUTABLE} --remove-needed libcuda.so.1 --remove-needed libcuda.so \"$\" + WORKING_DIRECTORY \"${CMAKE_INSTALL_PREFIX}\" + RESULT_VARIABLE patchelf_result + ) + if(patchelf_result EQUAL 0) + message(STATUS \"`patchelf` successfully removed dependency `libcuda.so` from $\") + else() + message(WARNING \"`patchelf` failed to remove dependency `libcuda.so` from $\") + endif() + ") + endforeach() endif() install( - TARGETS tvm tvm_runtime tilelang_module tilelang + TARGETS ${TILELANG_OUTPUT_TARGETS} LIBRARY DESTINATION tilelang/lib + RUNTIME DESTINATION tilelang/lib + ARCHIVE DESTINATION tilelang/lib ) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 9e380d831..5eba9044a 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -17,23 +17,23 @@ diverse, inclusive, and healthy community. Examples of behavior that contributes to a positive environment for our community include: -* Demonstrating empathy and kindness toward other people -* Being respectful of differing opinions, viewpoints, and experiences -* Giving and gracefully accepting constructive feedback -* Accepting responsibility and apologizing to those affected by our mistakes, +- Demonstrating empathy and kindness toward other people +- Being respectful of differing opinions, viewpoints, and experiences +- Giving and gracefully accepting constructive feedback +- Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience -* Focusing on what is best not just for us as individuals, but for the overall +- Focusing on what is best not just for us as individuals, but for the overall community Examples of unacceptable behavior include: -* The use of sexualized language or imagery, and sexual attention or advances of +- The use of sexualized language or imagery, and sexual attention or advances of any kind -* Trolling, insulting or derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or email address, +- Trolling, insulting or derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as a physical or email address, without their explicit permission -* Other conduct which could reasonably be considered inappropriate in a +- Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e4b45e24b..45284e980 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,7 @@ That would be awesome if you want to contribute something to TileLang! -### Table of Contents +## Table of Contents - [Report Bugs](#report-bugs) - [Ask Questions](#ask-questions) @@ -81,6 +81,8 @@ in the main directory. This installation is removable by: python3 -m pip uninstall tilelang ``` +We also recommend installing TileLang in a more manual way for better control over the build process, by compiling the C++ extensions first and set the `PYTHONPATH`. See [Working from Source via `PYTHONPATH`](https://tilelang.com/get_started/Installation.html#working-from-source-via-pythonpath) for detailed instructions. + ## Lint Check To check the linting, run: diff --git a/LICENSE b/LICENSE index 2122252e9..09dd51c8c 100644 --- a/LICENSE +++ b/LICENSE @@ -1,7 +1,7 @@ MIT License Copyright (c) Tile-AI. 
- **During the period from December 1, 2024, to Mar 14, 2025, this project is + **During the period from December 1, 2024, to Mar 14, 2025, this project is subject to additional collaboration terms with Microsoft Corporation.** Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/README.md b/README.md index d7cdabee5..30c518e05 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,9 @@ # Tile Language [![PyPI version](https://badge.fury.io/py/tilelang.svg)](https://badge.fury.io/py/tilelang) -[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/tile-ai/tilelang) [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?logo=discord&logoColor=white)](https://discord.gg/TUrHyJnKPG) - +[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/tile-ai/tilelang) +[![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?logo=discord&logoColor=white)](https://discord.gg/TUrHyJnKPG) +[![Puzzles](https://img.shields.io/badge/🧩_Learn-TileLang_Puzzles-blueviolet)](https://github.com/tile-ai/tilelang-puzzles) Tile Language (**tile-lang**) is a concise domain-specific language designed to streamline the development of high-performance GPU/CPU kernels (e.g., GEMM, Dequant GEMM, FlashAttention, LinearAttention). By employing a Pythonic syntax with an underlying compiler infrastructure on top of [TVM](https://tvm.apache.org/), tile-lang allows developers to focus on productivity without sacrificing the low-level optimizations necessary for state-of-the-art performance. @@ -13,6 +14,10 @@ Tile Language (**tile-lang**) is a concise domain-specific language designed to ## Latest News +- 02/02/2026 🧩: Check out [TileLang Puzzles](https://github.com/tile-ai/tilelang-puzzles), a fun and interactive way to learn TileLang programming with 10 progressively harder puzzles! +- 12/18/2025 🚀: Added [CuTeDSL backend](https://github.com/tile-ai/tilelang/pull/1421) support, enabling compilation to NVIDIA CUTLASS CuTe DSL! Join us in building and optimizing this exciting new backend: [Issue #1454](https://github.com/tile-ai/tilelang/issues/1454). +- 12/17/2025 🔬: Integrated [Z3 theorem prover](https://github.com/tile-ai/tilelang/pull/1367) into TVM Arith Analyzer, bringing SMT-based symbolic reasoning for enhanced optimizations and automatic correctness verification! +- 10/31/2025 🔧: Migrated to [apache-tvm-ffi](https://github.com/tile-ai/tilelang/pull/1108), significantly reducing CPU overhead! - 10/30/2025 📦: We have released v0.1.6.post2, which is the last version compatible with Python 3.8. - 10/07/2025 🍎: Added Apple Metal Device support, check out [Pull Request #799](https://github.com/tile-ai/tilelang/pull/799) for details. - 09/29/2025 🎉: Thrilled to announce that ​​AscendC​​ and ​Ascend​NPU IR​​ backends targeting Huawei Ascend chips are now supported! @@ -21,7 +26,7 @@ Check out the preview here: This includes implementations across two branches: [ascendc_pto](https://github.com/tile-ai/tilelang-ascend) and [npuir](https://github.com/tile-ai/tilelang-ascend/tree/npuir). -Feel free to explore and share your feedback! +Feel free to explore and share your feedback! - 07/04/2025 🚀: Introduced `T.gemm_sp` for 2:4 sparse tensor core support, check out [Pull Request #526](https://github.com/tile-ai/tilelang/pull/526) for details. - 06/05/2025 ✨: Added [NVRTC Backend](https://github.com/tile-ai/tilelang/pull/461) to significantly reduce compilation time for cute templates! 
- 04/14/2025 🚀: Added high-performance FlashMLA implementation for AMD MI300X, achieving performance parity with hand-optimized assembly kernels of Aiter! See [example_mla_amd](./examples/deepseek_mla/amd/README.md) for details. @@ -46,7 +51,6 @@ Although tile-lang aims to be portable across a range of Devices, it has been sp Within the `examples` directory, you will also find additional complex kernels—such as convolutions, forward/backward passes for FlashAttention, more operators will continuously be added. - ## Benchmark Summary TileLang achieves exceptional performance across a variety of computational patterns. Comprehensive benchmark scripts and settings are available at [tilelang-benchmark](https://github.com/tile-ai/tilelang-benchmark). Below are selected results showcasing its capabilities: @@ -61,7 +65,7 @@ TileLang achieves exceptional performance across a variety of computational patt mla decode performance bs128 on H100 - + - Flash Attention Performance on H100
operator performance on H100 @@ -106,9 +110,9 @@ pip install -e . -v # remove -e option if you don't want to install in editable ### Method 2: Build from Source We currently provide three ways to install **tile-lang** from source: - - [Install from Source (using your own TVM installation)](./docs/get_started/Installation.md#method-1-install-from-source-using-your-own-tvm-installation) - - [Install from Source (using the bundled TVM submodule)](./docs/get_started/Installation.md#method-2-install-from-source-using-the-bundled-tvm-submodule) - - [Install Using the Provided Script](./docs/get_started/Installation.md#method-3-install-using-the-provided-script) +- [Install from Source (using your own TVM installation)](./docs/get_started/Installation.md#method-1-install-from-source-using-your-own-tvm-installation) +- [Install from Source (using the bundled TVM submodule)](./docs/get_started/Installation.md#method-2-install-from-source-using-the-bundled-tvm-submodule) +- [Install Using the Provided Script](./docs/get_started/Installation.md#method-3-install-using-the-provided-script) ### Method 3: Install with Nightly Version @@ -130,93 +134,95 @@ In this section, you'll learn how to write and execute a straightforward GEMM (m Below is an example that demonstrates more advanced features: layout annotation, parallelized copy, and swizzle for improved L2 cache locality. This snippet shows how to adapt your kernel to maximize performance on complex hardware. ```python -import tilelang -import tilelang.language as T - # @tilelang.jit(target="cuda") # target currently can be "cuda" or "hip" or "cpu". # if not specified, it will be inferred from the input tensors during compile time @tilelang.jit -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - - @T.prim_func - def matmul_relu_kernel( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), - ): - # Initialize Kernel Context - with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): - A_shared = T.alloc_shared((block_M, block_K), dtype) - B_shared = T.alloc_shared((block_K, block_N), dtype) - C_local = T.alloc_fragment((block_M, block_N), accum_dtype) - - # Enable rasterization for better L2 cache locality (Optional) - # T.use_swizzle(panel_size=10, enable=True) - - # Clear local accumulation - T.clear(C_local) - - for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3): - # Copy tile of A - # This is a sugar syntax for parallelized copy - T.copy(A[by * block_M, ko * block_K], A_shared) - - # Copy tile of B - T.copy(B[ko * block_K, bx * block_N], B_shared) - - # Perform a tile-level GEMM on the shared buffers - # Currently we dispatch to the cute/hip on Nvidia/AMD GPUs - T.gemm(A_shared, B_shared, C_local) - - # relu - for i, j in T.Parallel(block_M, block_N): - C_local[i, j] = T.max(C_local[i, j], 0) - - # Copy result back to global memory - T.copy(C_local, C[by * block_M, bx * block_N]) - - return matmul_relu_kernel - - -M = 1024 # M = T.dynamic("m") if you want to use dynamic shape -N = 1024 -K = 1024 -block_M = 128 -block_N = 128 -block_K = 32 - -# 1. Define the kernel (matmul) and compile/lower it into an executable module -matmul_relu_kernel = matmul(M, N, K, block_M, block_N, block_K) - -# 3. 
Test the kernel in Python with PyTorch data -import torch - -# Create random input tensors on the GPU -a = torch.randn(M, K, device="cuda", dtype=torch.float16) -b = torch.randn(K, N, device="cuda", dtype=torch.float16) -c = torch.empty(M, N, device="cuda", dtype=torch.float16) +def matmul_relu( + A, B, + block_M: int = 64, + block_N: int = 64, + block_K: int = 64, + dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float32, +): + # declare compilation shape constant + M, N, K = T.const('M, N, K') -# Run the kernel through the Profiler -matmul_relu_kernel(a, b, c) + # annotate input tensor shape + A: T.Tensor[[M, K], dtype] + B: T.Tensor[[K, N], dtype] -print(c) -# Reference multiplication using PyTorch -ref_c = torch.relu(a @ b) + # allocate output tensor + C = T.empty([M, N], dtype) -# Validate correctness -torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) -print("Kernel output matches PyTorch reference.") + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_K, block_N), dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) -# 4. Retrieve and inspect the generated CUDA source (optional) -# cuda_source = jit_kernel.get_kernel_source() -# print("Generated CUDA kernel:\n", cuda_source) + # Enable rasterization for better L2 cache locality (Optional) + # T.use_swizzle(panel_size=10, enable=True) -# 5.Profile latency with kernel -profiler = matmul_relu_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + # Clear local accumulation + T.clear(C_local) -latency = profiler.do_bench() + for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3): + # Copy tile of A + # This is a sugar syntax for parallelized copy + T.copy(A[by * block_M, ko * block_K], A_shared) + + # Copy tile of B + T.copy(B[ko * block_K, bx * block_N], B_shared) + + # Perform a tile-level GEMM on the shared buffers + # Currently we dispatch to the cute/hip on Nvidia/AMD GPUs + T.gemm(A_shared, B_shared, C_local) + + # relu + for i, j in T.Parallel(block_M, block_N): + C_local[i, j] = T.max(C_local[i, j], 0) + # Copy result back to global memory + T.copy(C_local, C[by * block_M, bx * block_N]) + + # You can write multiple cuda kernel in one function, they execute sequentially + # with T.Kernel(...) as ... 
+ + # Return the tensor, you can also return multiple tensors + return C + + +M, N, K = 1024, 1024, 1024 + +a = torch.randn(M, K, device="cuda", dtype=torch.float16) +b = torch.randn(K, N, device="cuda", dtype=torch.float16) +c_ref = torch.relu(a @ b) + +# Call the kernel +c = matmul_relu(a, b) +torch.testing.assert_close(c, c_ref, rtol=1e-2, atol=1e-2) + +# Call the kernel with overwritten compilation constants +c = matmul_relu(a, b, block_M=128, block_N=128, block_K=64) +torch.testing.assert_close(c, c_ref, rtol=1e-2, atol=1e-2) + +# Retrieve the compiled kernel +kernel = matmul_relu.compile(a, b) # use torch.Tensor +kernel = matmul_relu.compile( # use T.Tensor as placeholder + T.Tensor((M, K), T.float16), + T.Tensor((K, N), T.float16) +) +kernel = matmul_relu.compile( # directly specify the shape constants + M=M, N=N, K=K, + block_M=128, block_N=128, block_K=64 +) +print(kernel.get_kernel_source()) +c = kernel(a, b) + +# 5.Profile latency with kernel +profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) +latency = profiler.do_bench() print(f"Latency: {latency} ms") ``` diff --git a/THIRDPARTYNOTICES.txt b/THIRDPARTYNOTICES.txt index b7c481841..3558662a8 100644 --- a/THIRDPARTYNOTICES.txt +++ b/THIRDPARTYNOTICES.txt @@ -1,5 +1,5 @@ -BitBLAS uses third-party material as listed below. The attached notices are -provided for informational purposes only. +BitBLAS uses third-party material as listed below. The attached notices are +provided for informational purposes only. Notice for apache/tvm ------------------------------- diff --git a/VERSION b/VERSION index 5ed6219f4..c5578b40d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.6.post2 +0.1.7.post3 diff --git a/benchmark/blocksparse_attention/benchmark_library_dense_fmha.py b/benchmark/blocksparse_attention/benchmark_library_dense_fmha.py index 6401276ac..3dd82aa5e 100644 --- a/benchmark/blocksparse_attention/benchmark_library_dense_fmha.py +++ b/benchmark/blocksparse_attention/benchmark_library_dense_fmha.py @@ -7,10 +7,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -28,15 +25,15 @@ def get_sparse_attn_mask_from_threshold(x, threshold, use_dense_for_last_block=F def benchmark_topk_sparse_attention(): from benchmark_configs import configs + torch.manual_seed(0) # Config for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs: - # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) import flash_attn diff --git a/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py 
b/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py index aefe4d420..0018e9c93 100644 --- a/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py +++ b/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py @@ -15,10 +15,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -39,16 +36,15 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) block_N = 64 num_stages = 2 threads = 128 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] block_mask_shape = [batch, heads, downsample_len, downsample_len] - dtype = "float16" - accum_dtype = "float" - block_mask_dtype = "bool" + dtype = T.float16 + accum_dtype = T.float32 + block_mask_dtype = T.bool def kernel_func(block_M, block_N, num_stages, threads): - @T.macro def MMA0( K: T.Tensor(shape, dtype), @@ -60,11 +56,10 @@ def MMA0( by: T.int32, bz: T.int32, ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -79,22 +74,24 @@ def MMA1( by: T.int32, bz: T.int32, ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. 
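For context on the `scores_max` bookkeeping added in the `Softmax` macro above (the running max is now clamped against `scores_max_prev` before rescaling): below is a plain-PyTorch sketch of the same online-softmax tile update, using base-2 exponentials to match the `log2(e)`-scaled scores in this kernel. The function name and the split between this update and the subsequent `P @ V` accumulation are illustrative, not the TileLang API.

```python
# Illustrative online-softmax tile update in plain PyTorch (not TileLang).
# acc_o, logsum, scores_max_prev carry the running state across K/V tiles;
# scores is the [block_M, block_N] score tile for the current iteration,
# already multiplied by (1/sqrt(dim)) * log2(e) as in the kernel above.
import torch

def online_softmax_step(scores, acc_o, logsum, scores_max_prev):
    scores_max = torch.maximum(scores_max_prev, scores.amax(dim=1))
    # "Check_inf": fully masked rows (max == -inf) are clamped to 0 so the
    # rescale factor below stays finite, as noted in the kernel comments.
    scores_max = torch.where(scores_max == float("-inf"),
                             torch.zeros_like(scores_max), scores_max)
    scores_scale = torch.exp2(scores_max_prev - scores_max)   # rescale factor for old state
    p = torch.exp2(scores - scores_max[:, None])               # unnormalized probs for this tile
    logsum = logsum * scores_scale + p.sum(dim=1)
    acc_o = acc_o * scores_scale[:, None]                      # caller then adds p.to(v.dtype) @ v
    return p, acc_o, logsum, scores_max
```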
@@ -114,22 +111,21 @@ def Softmax( @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.prim_func def main( - Q: T.Tensor(shape, dtype), - K: T.Tensor(shape, dtype), - V: T.Tensor(shape, dtype), - BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), - Output: T.Tensor(shape, dtype), + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), + Output: T.Tensor(shape, dtype), ): - with T.Kernel( - T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): + with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -142,31 +138,29 @@ def main( scores_scale = T.alloc_fragment([block_M], accum_dtype) scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - block_mask = T.alloc_local([downsample_len], block_mask_dtype) + block_mask = T.alloc_fragment([downsample_len], block_mask_dtype) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - for vj in T.serial(downsample_len): - block_mask[vj] = BlockSparseMask[bz, by, bx, vj] + T.copy(BlockSparseMask[bz, by, bx, :], block_mask) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): - if block_mask[k]: + if block_mask[k] != 0: MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, - scores_sum, logsum) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main @@ -175,26 +169,23 @@ def main( def benchmark_topk_sparse_attention(): from benchmark_configs import configs + torch.manual_seed(0) # Config for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs: - # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) # Create sparse mask (downsampled to block level) downsample_factor = BLOCK downsample_len = math.ceil(SEQ_LEN / 
downsample_factor) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) - program = blocksparse_flashattn( - BATCH, N_HEADS, SEQ_LEN, D_HEAD, downsample_len, is_causal=True) + program = blocksparse_flashattn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, downsample_len, is_causal=True) kernel = tilelang.compile(program, out_idx=4) def benchmark_fn(): diff --git a/benchmark/blocksparse_attention/benchmark_torch_block_sparse_fmha.py b/benchmark/blocksparse_attention/benchmark_torch_block_sparse_fmha.py index e4828ce5f..85d754ae3 100644 --- a/benchmark/blocksparse_attention/benchmark_torch_block_sparse_fmha.py +++ b/benchmark/blocksparse_attention/benchmark_torch_block_sparse_fmha.py @@ -10,10 +10,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -31,39 +28,37 @@ def get_sparse_attn_mask_from_threshold(x, threshold, use_dense_for_last_block=F def benchmark_topk_sparse_attention(): from benchmark_configs import configs + torch.manual_seed(0) # Config for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs: - # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) downsample_factor = BLOCK downsample_len = math.ceil(SEQ_LEN / downsample_factor) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) def benchmark_fn(): # Compute reference # Expand block mask to full attention matrix - full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')) + full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")) full_mask = full_mask[..., :SEQ_LEN, :SEQ_LEN].bool() full_mask = full_mask & torch.tril(torch.ones_like(full_mask)) # Apply causal # PyTorch reference implementation - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale - attn = attn.masked_fill(~full_mask, float('-inf')) + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale + attn = attn.masked_fill(~full_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", 
attn, v) return ref_output ref_latency = do_bench( diff --git a/benchmark/blocksparse_attention/benchmark_triton_block_sparse_fmha.py b/benchmark/blocksparse_attention/benchmark_triton_block_sparse_fmha.py index 86ac894bc..7ebca93a6 100644 --- a/benchmark/blocksparse_attention/benchmark_triton_block_sparse_fmha.py +++ b/benchmark/blocksparse_attention/benchmark_triton_block_sparse_fmha.py @@ -15,10 +15,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -56,7 +53,6 @@ def _fwd_kernel_inner( BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, ): - mask_val = tl.load(block_mask_ptr + k_block_col_idx * stride_bmask_n) if mask_val == True: @@ -72,8 +68,7 @@ def _fwd_kernel_inner( # the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N if LAST_K_BLOCK: - qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, - float('-inf')) + qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, float("-inf")) m_ij = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_ij[:, None] @@ -153,7 +148,7 @@ def _fwd_kernel( v_ptrs = V + off_v mask_ptrs = block_mask_ptr + start_m * stride_bmm - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf') + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") l_i = tl.zeros([BLOCK_M], dtype=tl.float32) acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) @@ -191,24 +186,12 @@ def _fwd_kernel( acc = acc * l_recip acc = acc.to(Out.dtype.element_ty) - off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[ - None, :] * stride_od + off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :] * stride_od out_ptrs = Out + off_o tl.store(out_ptrs, acc, mask=offs_m[:, None] < N_CTX) -def _forward(ctx, - q, - k, - v, - block_sparse_mask, - sm_scale, - BLOCK_M=64, - BLOCK_N=64, - num_warps=None, - num_stages=1, - out=None): - +def _forward(ctx, q, k, v, block_sparse_mask, sm_scale, BLOCK_M=64, BLOCK_N=64, num_warps=None, num_stages=1, out=None): assert q.shape[-1] == k.shape[-1] == v.shape[-1] assert k.shape[2] == v.shape[2] o = out if out is not None else torch.empty_like(q).contiguous() @@ -253,7 +236,6 @@ def _forward(ctx, class _sparse_attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, block_sparse_dense, sm_scale): # shape constraints @@ -271,24 +253,22 @@ def backward(ctx, do): def benchmark_topk_sparse_attention(): from benchmark_configs import configs + torch.manual_seed(0) # Config for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs: - # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, 
device="cuda", dtype=torch.float16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) downsample_factor = BLOCK downsample_len = math.ceil(SEQ_LEN / downsample_factor) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) diff --git a/benchmark/mamba2/README.md b/benchmark/mamba2/README.md index 0b6de19b1..f0b4b7e80 100644 --- a/benchmark/mamba2/README.md +++ b/benchmark/mamba2/README.md @@ -45,7 +45,6 @@ PY | 16384 | 2.531 | 135.711 | | 32768 | 5.076 | 135.379 | - ## Compare with Baselines - Triton: v3.5.0, mamba-ssm: v2.2.6.post3 @@ -56,4 +55,4 @@ PY Mamba2_chunk_scan Performance Comparison on H100
Performance comparison across compilers on NVIDIA H100
- \ No newline at end of file + diff --git a/benchmark/mamba2/benchmark_mamba_chunk_scan.py b/benchmark/mamba2/benchmark_mamba_chunk_scan.py index aff810f66..55f802b4f 100644 --- a/benchmark/mamba2/benchmark_mamba_chunk_scan.py +++ b/benchmark/mamba2/benchmark_mamba_chunk_scan.py @@ -51,14 +51,15 @@ def ref_program(cb, x, dt, dA_cumsum, C, prev_states, D): dt_segment_sum = dA_cumsum[:, :, :, :, None] - dA_cumsum[:, :, :, None, :] decay = torch.exp(dt_segment_sum) scores_decay = cb * rearrange(decay, "b h c l s -> b c h l s") - causal_mask = torch.tril( - torch.ones(chunk_size, chunk_size, device=x.device, dtype=bool), diagonal=0) + causal_mask = torch.tril(torch.ones(chunk_size, chunk_size, device=x.device, dtype=bool), diagonal=0) scores_decay = scores_decay.masked_fill(~causal_mask, 0) - out = torch.einsum('bchls,bhcs,bcshp->bclhp', scores_decay.to(x.dtype), dt.to(x.dtype), - rearrange(x, "b (c s) h p -> b c s h p", c=nchunks)) + out = torch.einsum( + "bchls,bhcs,bcshp->bclhp", scores_decay.to(x.dtype), dt.to(x.dtype), rearrange(x, "b (c s) h p -> b c s h p", c=nchunks) + ) state_decay_out = torch.exp(rearrange(dA_cumsum, "b h c l -> b c l h 1")) - out_prev = torch.einsum('bclhn,bchpn->bclhp', rearrange( - C, "b (c l) h n -> b c l h n", c=nchunks), prev_states.to(C.dtype)) * state_decay_out + out_prev = ( + torch.einsum("bclhn,bchpn->bclhp", rearrange(C, "b (c l) h n -> b c l h n", c=nchunks), prev_states.to(C.dtype)) * state_decay_out + ) out = out + out_prev out = rearrange(out, "b c l h p -> b (c l) h p") if D is not None: @@ -74,7 +75,6 @@ def chunk_scan_triton(cb, x, dt, dA_cumsum, C, states, D): def chunk_scan_helion(cb, x, dt, dA_cumsum, C, states, D): - @helion.kernel() def helion_mamba2_chunk_scan_kernel( cb: torch.Tensor, @@ -118,8 +118,7 @@ def helion_mamba2_chunk_scan_kernel( dtype = cb.dtype accum_dtype = torch.float32 - assert (x.dtype == dt.dtype == dA_cumsum.dtype == C.dtype == prev_states.dtype == D.dtype == - dtype) + assert x.dtype == dt.dtype == dA_cumsum.dtype == C.dtype == prev_states.dtype == D.dtype == dtype out = torch.empty_like(x) @@ -127,11 +126,10 @@ def helion_mamba2_chunk_scan_kernel( for tile_h, tile_m, tile_n, tile_b, tile_c in hl.tile( [nheads, chunk_size, headdim, batch, nchunks], - block_size=[1, block_m, block_n, 1, 1], + block_size=[1, block_m, block_n, 1, 1], ): acc_o = hl.zeros([tile_m, tile_n], dtype=accum_dtype) - dA_cumsum_local_m = dA_cumsum[tile_b.begin, tile_h.begin, tile_c.begin, - tile_m].to(torch.float32) + dA_cumsum_local_m = dA_cumsum[tile_b.begin, tile_h.begin, tile_c.begin, tile_m].to(torch.float32) scale_m_local = torch.exp2(dA_cumsum_local_m * p) C_local = C[ @@ -152,10 +150,8 @@ def helion_mamba2_chunk_scan_kernel( tile_m, tile_k, ] - dA_cumsum_local_k = dA_cumsum[tile_b.begin, tile_h.begin, tile_c.begin, - tile_k].to(torch.float32) - cb_local *= torch.exp2(dA_cumsum_local_m[:, None] * p - - dA_cumsum_local_k[None, :] * p) + dA_cumsum_local_k = dA_cumsum[tile_b.begin, tile_h.begin, tile_c.begin, tile_k].to(torch.float32) + cb_local *= torch.exp2(dA_cumsum_local_m[:, None] * p - dA_cumsum_local_k[None, :] * p) dt_local = dt[tile_b.begin, tile_h.begin, tile_c.begin, tile_k].to(torch.float32) cb_local = (cb_local * dt_local[None, :]).to(dtype) pred = (tile_m.index + 0)[:, None] >= (tile_k.index + 0)[None, :] @@ -169,11 +165,9 @@ def helion_mamba2_chunk_scan_kernel( acc_o = hl.dot(cb_local, x_local, acc=acc_o) D_local = D[tile_h.begin].to(torch.float32) - x_residual = x[tile_b.begin, tile_c.begin * chunk_size + tile_m.index, 
tile_h.begin, - tile_n].to(torch.float32) + x_residual = x[tile_b.begin, tile_c.begin * chunk_size + tile_m.index, tile_h.begin, tile_n].to(torch.float32) acc_o += x_residual * D_local - out[tile_b.begin, tile_c.begin * chunk_size + tile_m.index, tile_h.begin, - tile_n] = acc_o.to(dtype=dtype) + out[tile_b.begin, tile_c.begin * chunk_size + tile_m.index, tile_h.begin, tile_n] = acc_o.to(dtype=dtype) return out @@ -182,12 +176,7 @@ def helion_mamba2_chunk_scan_kernel( def get_configs(): - iter_params = dict( - block_M=[64, 128, 256], - block_N=[32, 64], - block_K=[64, 128, 256], - block_Dstate=[128], - num_stages=[1, 2, 3, 4, 5]) + iter_params = dict(block_M=[64, 128, 256], block_N=[32, 64], block_K=[64, 128, 256], block_Dstate=[128], num_stages=[1, 2, 3, 4, 5]) return [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())] @@ -198,56 +187,58 @@ def get_configs(): tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, }, ) -def chunk_scan_fwd(batch, - seqlen, - chunk_size, - ngroups, - nheads, - headdim, - dstate, - block_M=64, - block_N=64, - block_K=64, - block_Dstate=128, - num_stages=2, - threads=128): - dtype = "float16" - accum_dtype = "float" +def chunk_scan_fwd( + batch, + seqlen, + chunk_size, + ngroups, + nheads, + headdim, + dstate, + block_M=64, + block_N=64, + block_K=64, + block_Dstate=128, + num_stages=2, + threads=128, +): + dtype = T.float16 + accum_dtype = T.float32 nchunks = T.ceildiv(seqlen, chunk_size) p = 1.44269504 @T.prim_func def main( - cb: T.Tensor((batch, nchunks, ngroups, chunk_size, chunk_size), dtype), # type: ignore - x: T.Tensor((batch, seqlen, nheads, headdim), dtype), # type: ignore - dt: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore - dA_cumsum: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore - C: T.Tensor((batch, seqlen, ngroups, dstate), dtype), # type: ignore - prev_states: T.Tensor((batch, nchunks, nheads, headdim, dstate), dtype), # type: ignore - D: T.Tensor((nheads), dtype), # type: ignore - Output: T.Tensor((batch, seqlen, nheads, headdim), dtype) # type: ignore + cb: T.Tensor((batch, nchunks, ngroups, chunk_size, chunk_size), dtype), # type: ignore + x: T.Tensor((batch, seqlen, nheads, headdim), dtype), # type: ignore + dt: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore + dA_cumsum: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore + C: T.Tensor((batch, seqlen, ngroups, dstate), dtype), # type: ignore + prev_states: T.Tensor((batch, nchunks, nheads, headdim, dstate), dtype), # type: ignore + D: T.Tensor((nheads), dtype), # type: ignore + Output: T.Tensor((batch, seqlen, nheads, headdim), dtype), # type: ignore ): - with T.Kernel( - nheads, - T.ceildiv(chunk_size, block_M) * T.ceildiv(headdim, block_N), - batch * nchunks, - threads=threads) as (bz, bx, by): + with T.Kernel(nheads, T.ceildiv(chunk_size, block_M) * T.ceildiv(headdim, block_N), batch * nchunks, threads=threads) as ( + bz, + bx, + by, + ): acc_o = T.alloc_fragment((block_M, block_N), accum_dtype) acc_o_shared = T.alloc_shared((block_M, block_N), dtype) - cb_shared = T.alloc_shared((block_M, block_K), dtype, scope="shared.dyn") + cb_shared = T.alloc_shared((block_M, block_K), dtype) cb_local = T.alloc_fragment((block_M, block_K), dtype) - dA_cs_k_shared = T.alloc_shared((block_K), dtype, scope="shared") + dA_cs_k_shared = T.alloc_shared((block_K), dtype) dA_cs_k_local = T.alloc_fragment((block_K), accum_dtype) dA_cs_m_local = T.alloc_fragment((block_M), accum_dtype) - 
dt_shared = T.alloc_shared((block_K), dtype, scope="shared") + dt_shared = T.alloc_shared((block_K), dtype) dt_local = T.alloc_fragment((block_K), accum_dtype) - x_shared = T.alloc_shared((block_K, block_N), dtype, scope="shared.dyn") - dA_cs_m_shared = T.alloc_shared((block_M), dtype, scope="shared") + x_shared = T.alloc_shared((block_K, block_N), dtype) + dA_cs_m_shared = T.alloc_shared((block_M), dtype) scale_m_local = T.alloc_fragment((block_M), accum_dtype) C_shared = T.alloc_shared((block_M, block_Dstate), dtype) prev_state_shared = T.alloc_shared((block_N, block_Dstate), dtype) D_local = T.alloc_fragment((1), accum_dtype) - x_residual_shared = T.alloc_shared((block_M, block_N), dtype, scope="shared.dyn") + x_residual_shared = T.alloc_shared((block_M, block_N), dtype) x_residual_local = T.alloc_fragment((block_M, block_N), accum_dtype) batch_idx = by % batch @@ -257,27 +248,31 @@ def main( m_idx = bx // T.ceildiv(headdim, block_N) n_idx = bx % T.ceildiv(headdim, block_N) - T.annotate_layout({ - acc_o_shared: tilelang.layout.make_swizzled_layout(acc_o_shared), - cb_shared: tilelang.layout.make_swizzled_layout(cb_shared), - x_residual_shared: tilelang.layout.make_swizzled_layout(x_residual_shared) - }) + T.annotate_layout( + { + cb_shared: tilelang.layout.make_swizzled_layout(cb_shared), + x_residual_shared: tilelang.layout.make_swizzled_layout(x_residual_shared), + } + ) T.no_set_max_nreg() - T.copy(dA_cumsum[batch_idx, bz, chunk_idx, m_idx * block_M:(m_idx + 1) * block_M], - dA_cs_m_shared) + T.copy(dA_cumsum[batch_idx, bz, chunk_idx, m_idx * block_M : (m_idx + 1) * block_M], dA_cs_m_shared) T.copy(dA_cs_m_shared, dA_cs_m_local) T.clear(acc_o) for i in T.Parallel(block_M): scale_m_local[i] = T.exp2(dA_cs_m_local[i] * p) T.copy( - C[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size + - (m_idx + 1) * block_M, bz // (nheads // ngroups), 0:block_Dstate], C_shared) - T.copy( - prev_states[batch_idx, chunk_idx, bz, n_idx * block_N:(n_idx + 1) * block_N, - 0:block_Dstate], prev_state_shared) + C[ + batch_idx, + chunk_idx * chunk_size + m_idx * block_M : chunk_idx * chunk_size + (m_idx + 1) * block_M, + bz // (nheads // ngroups), + 0:block_Dstate, + ], + C_shared, + ) + T.copy(prev_states[batch_idx, chunk_idx, bz, n_idx * block_N : (n_idx + 1) * block_N, 0:block_Dstate], prev_state_shared) T.gemm(C_shared, prev_state_shared, acc_o, transpose_B=True) for i, j in T.Parallel(block_M, block_N): acc_o[i, j] *= scale_m_local[i] @@ -286,34 +281,47 @@ def main( for k in T.Pipelined(loop_range, num_stages=num_stages): T.copy( - cb[batch_idx, chunk_idx, bz // (nheads // ngroups), - m_idx * block_M:(m_idx + 1) * block_M, k * block_K:(k + 1) * block_K], - cb_shared) + cb[ + batch_idx, + chunk_idx, + bz // (nheads // ngroups), + m_idx * block_M : (m_idx + 1) * block_M, + k * block_K : (k + 1) * block_K, + ], + cb_shared, + ) T.copy(cb_shared, cb_local) - T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], - dA_cs_k_shared) + T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K : (k + 1) * block_K], dA_cs_k_shared) T.copy(dA_cs_k_shared, dA_cs_k_local) for i, j in T.Parallel(block_M, block_K): - cb_local[i, - j] = cb_local[i, - j] * T.exp2(dA_cs_m_local[i] * p - dA_cs_k_local[j] * p) - T.copy(dt[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], dt_shared) + cb_local[i, j] = cb_local[i, j] * T.exp2(dA_cs_m_local[i] * p - dA_cs_k_local[j] * p) + T.copy(dt[batch_idx, bz, chunk_idx, k * block_K : (k + 1) * block_K], dt_shared) T.copy(dt_shared, 
dt_local) for i, j in T.Parallel(block_M, block_K): cb_local[i, j] *= dt_local[j] for i, j in T.Parallel(block_M, block_K): - cb_local[i, j] = T.if_then_else(m_idx * block_M + i >= k * block_K + j, - cb_local[i, j], 0) + cb_local[i, j] = T.if_then_else(m_idx * block_M + i >= k * block_K + j, cb_local[i, j], 0) T.copy( - x[batch_idx, chunk_idx * chunk_size + k * block_K:chunk_idx * chunk_size + - (k + 1) * block_K, bz, n_idx * block_N:(n_idx + 1) * block_N], x_shared) + x[ + batch_idx, + chunk_idx * chunk_size + k * block_K : chunk_idx * chunk_size + (k + 1) * block_K, + bz, + n_idx * block_N : (n_idx + 1) * block_N, + ], + x_shared, + ) T.gemm(cb_local, x_shared, acc_o) D_local[0] = D[bz] T.copy( - x[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size + - (m_idx + 1) * block_M, bz, n_idx * block_N:(n_idx + 1) * block_N], - x_residual_shared) + x[ + batch_idx, + chunk_idx * chunk_size + m_idx * block_M : chunk_idx * chunk_size + (m_idx + 1) * block_M, + bz, + n_idx * block_N : (n_idx + 1) * block_N, + ], + x_residual_shared, + ) T.copy(x_residual_shared, x_residual_local) for i, j in T.Parallel(block_M, block_N): acc_o[i, j] += x_residual_local[i, j] * D_local[0] @@ -321,24 +329,37 @@ def main( T.copy(acc_o, acc_o_shared) T.copy( acc_o_shared, - Output[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size + - (m_idx + 1) * block_M, bz, n_idx * block_N:(n_idx + 1) * block_N]) + Output[ + batch_idx, + chunk_idx * chunk_size + m_idx * block_M : chunk_idx * chunk_size + (m_idx + 1) * block_M, + bz, + n_idx * block_N : (n_idx + 1) * block_N, + ], + ) return main if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=80, help='heads') - parser.add_argument('--groups', type=int, default=1, help='groups') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--chunk_size', type=int, default=256, help='chunk size') - parser.add_argument('--dim', type=int, default=64, help='dim') - parser.add_argument('--dstate', type=int, default=128, help='dstate') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=80, help="heads") + parser.add_argument("--groups", type=int, default=1, help="groups") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--chunk_size", type=int, default=256, help="chunk size") + parser.add_argument("--dim", type=int, default=64, help="dim") + parser.add_argument("--dstate", type=int, default=128, help="dstate") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - batch, heads, groups, seq_len, chunk_size, dim, dstate = args.batch, args.heads, args.groups, args.seq_len, args.chunk_size, args.dim, args.dstate + batch, heads, groups, seq_len, chunk_size, dim, dstate = ( + args.batch, + args.heads, + args.groups, + args.seq_len, + args.chunk_size, + args.dim, + args.dstate, + ) nchunks = math.ceil(seq_len / chunk_size) total_flops = 2 * batch * seq_len * chunk_size * heads * dim * 0.5 + 2 * batch * seq_len * heads * dim * dstate @@ -360,8 +381,7 @@ def main( D = torch.randn(heads).half().cuda() print("Benchmarking Triton...") - triton_latency = do_bench( - lambda: chunk_scan_triton(cb, x, dt, dA_cumsum, C, states, D), 
_n_warmup=10, _n_repeat=10) + triton_latency = do_bench(lambda: chunk_scan_triton(cb, x, dt, dA_cumsum, C, states, D), _n_warmup=10, _n_repeat=10) print(f"Triton TFlops: {total_flops / triton_latency * 1e-9}") print("Benchmarking Helion...") diff --git a/benchmark/matmul/benchmark_matmul.py b/benchmark/matmul/benchmark_matmul.py index c64f4fabf..dca98a676 100644 --- a/benchmark/matmul/benchmark_matmul.py +++ b/benchmark/matmul/benchmark_matmul.py @@ -2,10 +2,10 @@ import itertools import logging -import tilelang import tilelang.language as T from tilelang.autotuner import autotune from tilelang import jit + # Configure logger logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -61,9 +61,9 @@ def get_configs(args, kwargs): M=M, N=N, K=K, - in_dtype="float16", - out_dtype="float16", - accum_dtype="float", + in_dtype=T.float16, + out_dtype=T.float16, + accum_dtype=T.float32, ).with_arch(arch) func = carve_template.equivalent_function() @@ -101,9 +101,7 @@ def get_configs(args, kwargs): policy=[T.GemmWarpPolicy.Square], enable_rasteration=[True, False], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] return configs @@ -112,7 +110,9 @@ def get_configs(args, kwargs): warmup=3, rep=20, ) -@jit(out_idx=[2],) +@jit( + out_idx=[2], +) def matmul( M, N, @@ -154,14 +154,14 @@ def matmul( # Use half-precision for input data to reduce memory bandwidth, # accumulate in float for better numerical accuracy - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): """ The compiled TVM function for block-level matrix multiplication. @@ -176,7 +176,6 @@ def main( # Bind x-dimension to block index in N, # y-dimension to block index in M. 
with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): - # Allocate shared memory for A sub-block of shape (block_M, block_K) A_shared = T.alloc_shared((block_M, block_K), dtype) # Allocate shared memory for B sub-block of shape (block_N, block_K) @@ -188,8 +187,6 @@ def main( # Enable (or disable) swizzling optimization T.use_swizzle(panel_size=10, enable=enable_rasteration) - # to utilize swizzle tma layout - T.annotate_layout({C_shared: tilelang.layout.make_swizzled_layout(C_shared)}) # Clear out the accumulation buffer T.clear(C_local) diff --git a/benchmark/matmul/benchmark_matmul_intrinsic.py b/benchmark/matmul/benchmark_matmul_intrinsic.py index 94e36b385..4ef860c21 100644 --- a/benchmark/matmul/benchmark_matmul_intrinsic.py +++ b/benchmark/matmul/benchmark_matmul_intrinsic.py @@ -6,7 +6,8 @@ import tilelang.language as T from tilelang.intrinsics import get_swizzle_layout from tilelang.intrinsics.mma_macro_generator import ( - TensorCoreIntrinEmitter,) + TensorCoreIntrinEmitter, +) from tilelang.transform import simplify_prim_func from tilelang.autotuner import autotune import itertools @@ -48,22 +49,22 @@ def tl_matmul( enable_rasteration=False, ): assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - if out_dtype == "int32": + if out_dtype == T.int32: micro_size_k = 32 # This is a debug config - # chunk = 32 if in_dtype == "float16" else 64 + # chunk = 32 if in_dtype == T.float16 else 64 shared_scope = "shared.dyn" block_M = block_row_warps * warp_row_tiles @@ -103,12 +104,11 @@ def tl_matmul( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) @@ -116,10 +116,12 @@ def main( B_local = T.alloc_local((warp_cols * local_size_b), in_dtype) C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype) - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - B_shared: make_swizzle_layout(B_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + B_shared: make_swizzle_layout(B_shared), + } + ) # Improve L2 Cache T.use_swizzle(panel_size=10, enable=enable_rasteration) @@ -127,7 +129,6 @@ def main( T.clear(C_local) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, block_K): A_shared[i, k] = A[by * block_M + i, ko * block_K + k] @@ -137,7 +138,6 @@ def main( B_shared[j, k] = B[bx * block_N + j, ko * block_K + k] for ki in T.serial(0, (block_K // micro_size_k)): - # Load A into fragment mma_emitter.ldmatrix_a(A_local, A_shared, ki) @@ -194,9 +194,9 @@ def get_configs(args, kwargs): M=M, N=N, K=K, - in_dtype="float16", - out_dtype="float16", - accum_dtype="float16", + in_dtype=T.float16, + out_dtype=T.float16, + accum_dtype=T.float16, ).with_arch(arch) func = 
carve_template.equivalent_function() @@ -223,7 +223,6 @@ def get_configs(args, kwargs): for config in configs: print(config) else: - iter_params = dict( block_row_warps=[1, 2, 4], block_col_warps=[1, 2, 4], @@ -233,9 +232,7 @@ def get_configs(args, kwargs): stage=[0, 2], enable_rasteration=[True, False], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] return configs @@ -247,14 +244,16 @@ def get_configs(args, kwargs): ref_prog=ref_program, skip_check=True, ) -@tl.jit(out_idx=[2],) +@tl.jit( + out_idx=[2], +) def matmul( M, N, K, - in_dtype="float16", - out_dtype="float16", - accum_dtype="float16", + in_dtype=T.float16, + out_dtype=T.float16, + accum_dtype=T.float16, with_roller=False, block_row_warps=None, block_col_warps=None, @@ -291,19 +290,14 @@ def kernel(): parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M") parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") - parser.add_argument( - "--with_roller", - type=bool, - default=False, - help="Whether to use roller to deduce search spaces") - parser.add_argument( - "--dtype", type=str, default="float16", choices=["float16", "int8"], help="Input data type") + parser.add_argument("--with_roller", type=bool, default=False, help="Whether to use roller to deduce search spaces") + parser.add_argument("--dtype", type=str, default="float16", choices=["float16", "int8"], help="Input data type") args = parser.parse_args() M, N, K = args.m, args.n, args.k - in_dtype = args.dtype - out_dtype = "float32" if in_dtype == "int8" else "float16" - accum_dtype = "float32" if in_dtype == "int8" else "float16" + in_dtype = T.dtype(args.dtype) + out_dtype = T.float32 if in_dtype == T.int8 else T.float16 + accum_dtype = T.float32 if in_dtype == T.int8 else T.float16 with_roller = args.with_roller with_roller = True # Compute total floating-point operations diff --git a/benchmark/matmul/benchmark_matmul_sp.py b/benchmark/matmul/benchmark_matmul_sp.py index 4e4ed6128..7ecffc26a 100644 --- a/benchmark/matmul/benchmark_matmul_sp.py +++ b/benchmark/matmul/benchmark_matmul_sp.py @@ -9,7 +9,7 @@ from tilelang.autotuner import autotune from tilelang import jit from tilelang.contrib import nvcc -from tilelang.layout import make_metadata_layout +from tilelang.layout import make_cutlass_metadata_layout # Configure logger logger = logging.getLogger(__name__) @@ -70,7 +70,8 @@ def get_configs(M, N, K): thread_num, policy, enable_rasterization, - )) + ) + ) configs = [ { @@ -81,12 +82,13 @@ def get_configs(M, N, K): "thread_num": c[4], "policy": c[5], "enable_rasterization": c[6], # keep param name for backward-compat - } for c in _configs + } + for c in _configs ] return configs -def matmul_sp(M, N, K, accum_dtype): +def matmul_sp(M, N, K, in_dtype, accum_dtype): """ Create an autotuned matrix multiplication kernel for matrices of shape: - A: (M, K) @@ -126,7 +128,9 @@ def matmul_sp(M, N, K, accum_dtype): warmup=3, rep=20, ) - @jit(out_idx=[2],) + @jit( + out_idx=[2], + ) def kernel( block_M=None, block_N=None, @@ -161,15 +165,14 @@ def kernel( """ # Use half-precision for input data to reduce memory bandwidth, # accumulate in float for better numerical accuracy - dtype = "float16" e_factor, e_dtype = ARCH_INFO[arch] @T.prim_func def main( - A_sparse: T.Tensor((M, K 
// 2), dtype), - E: T.Tensor((M, K // e_factor), e_dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), accum_dtype), + A_sparse: T.Tensor((M, K // 2), in_dtype), + E: T.Tensor((M, K // e_factor), e_dtype), + B: T.Tensor((K, N), in_dtype), + C: T.Tensor((M, N), accum_dtype), ): """ The compiled TVM function for block-level matrix multiplication. @@ -183,13 +186,11 @@ def main( """ # Bind x-dimension to block index in N, # y-dimension to block index in M. - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): - + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): # Allocate shared memory for A sub-block of shape (block_M, block_K) - A_shared = T.alloc_shared((block_M, block_K // 2), dtype) + A_shared = T.alloc_shared((block_M, block_K // 2), in_dtype) # Allocate shared memory for B sub-block of shape (block_N, block_K) - B_shared = T.alloc_shared((block_K, block_N), dtype) + B_shared = T.alloc_shared((block_K, block_N), in_dtype) # Allocate shared memory for E sub-block of shape (block_M, block_K // E_factor) E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype) # Allocate a local fragment for intermediate accumulation @@ -202,14 +203,12 @@ def main( T.disable_warp_group_reg_alloc() T.use_swizzle(panel_size=10, enable=enable_rasterization) - T.annotate_layout({ - E: - make_metadata_layout( - E, mma_dtype="float16", backend="cutlass", block_k=block_K), - E_shared: - make_metadata_layout( - E_shared, mma_dtype="float16", backend="cutlass", block_k=block_K), - }) + T.annotate_layout( + { + E: make_cutlass_metadata_layout(E, mma_dtype=in_dtype, block_k=block_K), + E_shared: make_cutlass_metadata_layout(E_shared, mma_dtype=in_dtype, block_k=block_K), + } + ) # Loop over sub-blocks in K dimension, pipelined by num_stages for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): # Load a sub-block of A from global memory into A_shared @@ -220,7 +219,7 @@ def main( T.copy(B[k * block_K, bx * block_N], B_shared) # Perform a partial matrix multiplication: # C_local += A_shared @ B_shared - T.gemm_sp( + T.gemm_sp_v2( A_shared, E_shared, B_shared, @@ -244,18 +243,13 @@ def main( parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") parser.add_argument("--disable_cache", action="store_true") - parser.add_argument( - "--accum_dtype", - type=str, - default="float", - choices=["float", "float16"], - help="Accumulation datatype") + parser.add_argument("--accum_dtype", type=str, default="float", choices=["float", "float16"], help="Accumulation datatype") parser.add_argument( "--bench_torch_sparse", type=str, - choices=['cutlass', 'cusparselt'], + choices=["cutlass", "cusparselt"], default=None, - help="Whether to benchmark against torch sparse implementation, note that at current time only sm80 is supported" + help="Whether to benchmark against torch sparse implementation, note that at current time only sm80 is supported", ) args = parser.parse_args() @@ -268,7 +262,7 @@ def main( total_flops = 2 * M * N * K # matmul(...) 
returns (best_latency, best_config, ref_latency) - best_result = matmul_sp(M, N, K, args.accum_dtype) + best_result = matmul_sp(M, N, K, T.float16, args.accum_dtype) best_latency = best_result.latency best_config = best_result.config A = torch.randn(M, K, dtype=torch.float16, device="cuda") @@ -277,7 +271,8 @@ def main( if args.bench_torch_sparse is not None: from torch.sparse import to_sparse_semi_structured, SparseSemiStructuredTensor - if args.bench_torch_sparse == 'cutlass': + + if args.bench_torch_sparse == "cutlass": SparseSemiStructuredTensor._FORCE_CUTLASS = True A_sp = to_sparse_semi_structured(A, transposed=False) torch_sparse_latency = do_bench(lambda: A_sp @ B) @@ -288,8 +283,6 @@ def main( print(f"Best config: {best_config}") if args.bench_torch_sparse is not None: - print( - f"Torch sparse ({args.bench_torch_sparse}) TFlops: {total_flops / torch_sparse_latency * 1e-9:.3f}" - ) + print(f"Torch sparse ({args.bench_torch_sparse}) TFlops: {total_flops / torch_sparse_latency * 1e-9:.3f}") print(f"Reference Dense TFlops: {total_flops / ref_latency * 1e-9:.3f}") diff --git a/benchmark/matmul_fp8/benchmark_matmul.py b/benchmark/matmul_fp8/benchmark_matmul.py index 36b910355..64714b649 100644 --- a/benchmark/matmul_fp8/benchmark_matmul.py +++ b/benchmark/matmul_fp8/benchmark_matmul.py @@ -1,7 +1,7 @@ import argparse import itertools +import torch import logging -import tilelang import tilelang.language as T from tilelang.autotuner import autotune from tilelang import jit @@ -62,9 +62,9 @@ def get_configs(args, kwargs): M=M, N=N, K=K, - in_dtype="float16", - out_dtype="float16", - accum_dtype="float", + in_dtype=T.float16, + out_dtype=T.float16, + accum_dtype=T.float32, ).with_arch(arch) func = carve_template.equivalent_function() @@ -99,12 +99,11 @@ def get_configs(args, kwargs): block_K=[64, 128], num_stages=[0, 1, 2, 3], thread_num=[128, 256], + k_pack=[1, 2], policy=[T.GemmWarpPolicy.Square], enable_rasteration=[True, False], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] return configs @@ -114,7 +113,9 @@ def get_configs(args, kwargs): warmup=3, rep=20, ) -@jit(out_idx=[2],) +@jit( + out_idx=[2], +) def matmul( M, N, @@ -125,6 +126,7 @@ def matmul( block_K=None, num_stages=None, thread_num=None, + k_pack=None, policy=None, enable_rasteration=None, ): @@ -156,14 +158,14 @@ def matmul( # Use half-precision for input data to reduce memory bandwidth, # accumulate in float for better numerical accuracy - dtype = "float8_e4m3" - accum_dtype = "float" + dtype = T.float8_e4m3fnuz if torch.version.hip is not None else T.float8_e4m3fn + accum_dtype = T.float32 @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): """ The compiled TVM function for block-level matrix multiplication. @@ -178,7 +180,6 @@ def main( # Bind x-dimension to block index in N, # y-dimension to block index in M. 
with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): - # Allocate shared memory for A sub-block of shape (block_M, block_K) A_shared = T.alloc_shared((block_M, block_K), dtype) # Allocate shared memory for B sub-block of shape (block_N, block_K) @@ -190,8 +191,6 @@ def main( # Enable (or disable) swizzling optimization T.use_swizzle(panel_size=10, enable=enable_rasteration) - # to utilize swizzle tma layout - T.annotate_layout({C_shared: tilelang.layout.make_swizzled_layout(C_shared)}) # Clear out the accumulation buffer T.clear(C_local) @@ -210,6 +209,7 @@ def main( C_local, transpose_B=True, policy=policy, + k_pack=k_pack, ) # Write back the results from C_local to the global memory C T.copy(C_local, C_shared) diff --git a/cmake/load_tvm.cmake b/cmake/load_tvm.cmake index f013c3ba6..cb21be95f 100644 --- a/cmake/load_tvm.cmake +++ b/cmake/load_tvm.cmake @@ -3,12 +3,15 @@ set(TVM_BUILD_FROM_SOURCE TRUE) set(TVM_SOURCE ${CMAKE_SOURCE_DIR}/3rdparty/tvm) -if(DEFINED $ENV{TVM_ROOT}) +if(DEFINED ENV{TVM_ROOT}) if(EXISTS $ENV{TVM_ROOT}/cmake/config.cmake) set(TVM_SOURCE $ENV{TVM_ROOT}) + message(STATUS "Using TVM_ROOT from environment variable: ${TVM_SOURCE}") endif() endif() +message(STATUS "Using TVM source: ${TVM_SOURCE}") + set(TVM_INCLUDES ${TVM_SOURCE}/include ${TVM_SOURCE}/src diff --git a/cmake/pypi-z3/FindZ3.cmake b/cmake/pypi-z3/FindZ3.cmake new file mode 100644 index 000000000..d7920f8f9 --- /dev/null +++ b/cmake/pypi-z3/FindZ3.cmake @@ -0,0 +1,30 @@ +if(Z3_FOUND) + return() +endif() +find_package(Python3 COMPONENTS Interpreter REQUIRED) +execute_process( + COMMAND "${Python3_EXECUTABLE}" -c "import z3; print(z3.__path__[0])" + OUTPUT_VARIABLE Z3_PATH + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE Z3_PYTHON_RESULT +) +if(NOT Z3_PYTHON_RESULT EQUAL 0 OR Z3_PATH STREQUAL "") + message(FATAL_ERROR "Failed to locate z3 Python package. Ensure z3-solver>=4.13.0 is installed.") +endif() +message("-- Find Z3 in path: ${Z3_PATH}") +find_path(Z3_INCLUDE_DIR NO_DEFAULT_PATH NAMES z3++.h PATHS ${Z3_PATH}/include) +find_library(Z3_LIBRARY NO_DEFAULT_PATH NAMES z3 libz3 PATHS ${Z3_PATH}/bin ${Z3_PATH}/lib ${Z3_PATH}/lib64) +message("-- Found Z3 include dir: ${Z3_INCLUDE_DIR}") +message("-- Found Z3 library: ${Z3_LIBRARY}") +add_library(z3::libz3 SHARED IMPORTED GLOBAL) +set_target_properties(z3::libz3 + PROPERTIES + IMPORTED_LOCATION ${Z3_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${Z3_INCLUDE_DIR} +) +if(NOT Z3_INCLUDE_DIR OR NOT Z3_LIBRARY) + message(FATAL_ERROR "Could not find Z3 library or include directory") +endif() +set(Z3_CXX_INCLUDE_DIRS ${Z3_INCLUDE_DIR}) +set(Z3_C_INCLUDE_DIRS ${Z3_INCLUDE_DIR}) +set(Z3_FOUND TRUE) diff --git a/docker/Dockerfile.cu118 b/docker/Dockerfile.cu118 index 9256fc09b..969b0e43c 100644 --- a/docker/Dockerfile.cu118 +++ b/docker/Dockerfile.cu118 @@ -1,4 +1,4 @@ -FROM nvcr.io/nvidia/pytorch:22.12-py3 +FROM nvcr.io/nvidia/pytorch:22.12-py3 WORKDIR /root @@ -23,6 +23,6 @@ RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && c RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && ./install_cuda.sh + && cd TileLang && USE_CUDA=1 pip install -e . 
-v CMD bash diff --git a/docker/Dockerfile.cu120 b/docker/Dockerfile.cu120 index c89ce82ef..341fe40c0 100644 --- a/docker/Dockerfile.cu120 +++ b/docker/Dockerfile.cu120 @@ -1,4 +1,4 @@ -FROM nvcr.io/nvidia/pytorch:23.01-py3 +FROM nvcr.io/nvidia/pytorch:23.01-py3 WORKDIR /root @@ -23,6 +23,6 @@ RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && c RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && ./install_cuda.sh + && cd TileLang && USE_CUDA=1 pip install -e . -v CMD bash diff --git a/docker/Dockerfile.cu121 b/docker/Dockerfile.cu121 index 5b092773d..f91029d75 100644 --- a/docker/Dockerfile.cu121 +++ b/docker/Dockerfile.cu121 @@ -23,6 +23,6 @@ RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && c RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && ./install_cuda.sh + && cd TileLang && USE_CUDA=1 pip install -e . -v CMD bash diff --git a/docker/Dockerfile.cu123 b/docker/Dockerfile.cu123 index 2715536a8..b3d1217fd 100644 --- a/docker/Dockerfile.cu123 +++ b/docker/Dockerfile.cu123 @@ -23,6 +23,6 @@ RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && c RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && ./install_cuda.sh + && cd TileLang && USE_CUDA=1 pip install -e . -v CMD bash diff --git a/docker/Dockerfile.cu124 b/docker/Dockerfile.cu124 index fb9654f48..335f52565 100644 --- a/docker/Dockerfile.cu124 +++ b/docker/Dockerfile.cu124 @@ -23,6 +23,6 @@ RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && c RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && ./install_cuda.sh + && cd TileLang && USE_CUDA=1 pip install -e . -v CMD bash diff --git a/docker/Dockerfile.cu125 b/docker/Dockerfile.cu125 index c409667cb..148e44b41 100644 --- a/docker/Dockerfile.cu125 +++ b/docker/Dockerfile.cu125 @@ -23,6 +23,6 @@ RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && c RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && ./install_cuda.sh + && cd TileLang && USE_CUDA=1 pip install -e . 
-v CMD bash diff --git a/docker/Dockerfile.cu126 b/docker/Dockerfile.cu126 index 93593b5df..c031c2bc9 100644 --- a/docker/Dockerfile.cu126 +++ b/docker/Dockerfile.cu126 @@ -23,6 +23,6 @@ RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && c RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && ./install_cuda.sh + && cd TileLang && USE_CUDA=1 pip install -e . -v CMD bash diff --git a/docker/Dockerfile.cu128 b/docker/Dockerfile.cu128 index db5e1cb57..2b895ecd8 100644 --- a/docker/Dockerfile.cu128 +++ b/docker/Dockerfile.cu128 @@ -26,6 +26,6 @@ RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev z RUN pip install cython RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && cmake -S . -B build -DUSE_CUDA=ON && cmake --build build -j + && cd TileLang && USE_CUDA=1 pip install -e . -v CMD bash diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 1fb23a9f3..5f61f0e2e 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -9,23 +9,43 @@ ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential git wget \ libgtest-dev libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev llvm-dev \ + rocm-dev rocm-libs hip-dev hipblas-dev rocblas-dev \ && apt-get clean autoclean && rm -rf /var/lib/apt/lists/{cache,log} /tmp/* /var/tmp/* ENV PATH="/opt/conda/bin:${PATH}" ENV LIBGL_ALWAYS_INDIRECT=1 +ENV USE_ROCM=1 +ENV USE_CUDA=0 +ENV ROCM_HOME=/opt/rocm +ENV HIP_PLATFORM=amd +ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942" RUN conda run -n py_3.10 conda install pip cmake -y && \ conda run -n py_3.10 conda install -c conda-forge libstdcxx-ng=12 -y && \ conda clean --all -RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev +RUN apt-get update && apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev && \ + apt-get clean autoclean && rm -rf /var/lib/apt/lists/{cache,log} /tmp/* /var/tmp/* -RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main tilelang && \ - conda run -n py_3.10 bash -c "cd tilelang && ./install_rocm.sh" +# Copy local tilelang directory instead of cloning from git +# Build from tilelang root: docker build -f docker/Dockerfile.rocm -t mi300:latest . +COPY . /root/tilelang -RUN conda init bash +RUN mv /opt/conda/envs/py_3.10/compiler_compat /opt/conda/envs/py_3.10/compiler_compat.bak || true && \ + conda run -n py_3.10 bash -c "export USE_ROCM=1 USE_CUDA=0 && pip install 'numpy<2.0' --force-reinstall" && \ + conda run -n py_3.10 bash -c "cd /root/tilelang && \ + # Backup and modify pyproject.toml to remove torch from dependencies \ + cp pyproject.toml pyproject.toml.bak && \ + sed -i '/^[[:space:]]*\"torch/d' pyproject.toml && \ + # Install tilelang with all dependencies except torch \ + USE_ROCM=1 USE_CUDA=0 pip install -e . 
-v && \ + # Restore original pyproject.toml \ + mv pyproject.toml.bak pyproject.toml" + +RUN conda init bash && \ + echo "conda activate py_3.10" >> /root/.bashrc SHELL ["/bin/bash", "-l", "-c"] -CMD ["bash", "-c", "source ~/.bashrc && conda activate py_3.10 && exec bash"] \ No newline at end of file +ENTRYPOINT ["/bin/bash", "--login", "-i"] diff --git a/docs/.gitignore b/docs/.gitignore index 4d8eb4049..79ba97163 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,2 +1,2 @@ _build/ -autoapi/ \ No newline at end of file +autoapi/ diff --git a/docs/CNAME b/docs/CNAME index ca903c694..6862cd2e9 100644 --- a/docs/CNAME +++ b/docs/CNAME @@ -1 +1 @@ -tilelang.com \ No newline at end of file +tilelang.com diff --git a/docs/README.md b/docs/README.md index 349c0eccc..896d778d2 100644 --- a/docs/README.md +++ b/docs/README.md @@ -27,4 +27,4 @@ cd _build/html python3 -m http.server ``` -Then you can view the documentation in your browser at `http://localhost:8000` (the port can be customized by appending ` -p PORT_NUMBER` in the python command above). +Then you can view the documentation in your browser at `http://localhost:8000` (the port can be customized by appending `-p PORT_NUMBER` in the python command above). diff --git a/docs/_static/custom.css b/docs/_static/custom.css new file mode 100644 index 000000000..a1fee9c3d --- /dev/null +++ b/docs/_static/custom.css @@ -0,0 +1,10 @@ +/* Reduce the displayed size of the sidebar logo in Furo */ +.sidebar-logo { + max-height: 125px; + width: auto; +} + +/* Optional: keep container from growing too tall due to spacing */ +.sidebar-logo-container { + line-height: 0; +} diff --git a/docs/_static/img/logo-row.svg b/docs/_static/img/logo-row.svg index 633243f3a..e73244b74 100644 --- a/docs/_static/img/logo-row.svg +++ b/docs/_static/img/logo-row.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/docs/_static/img/logo-v2.png b/docs/_static/img/logo-v2.png new file mode 100644 index 000000000..410773f60 Binary files /dev/null and b/docs/_static/img/logo-v2.png differ diff --git a/docs/_static/img/logo.png b/docs/_static/img/logo.png new file mode 100644 index 000000000..5d04697ce Binary files /dev/null and b/docs/_static/img/logo.png differ diff --git a/docs/_static/img/sparse_mma_storage_example.png b/docs/_static/img/sparse_mma_storage_example.png new file mode 100644 index 000000000..0b1639819 Binary files /dev/null and b/docs/_static/img/sparse_mma_storage_example.png differ diff --git a/docs/compiler_internals/tensor_checks.md b/docs/compiler_internals/tensor_checks.md new file mode 100644 index 000000000..ed5a9e691 --- /dev/null +++ b/docs/compiler_internals/tensor_checks.md @@ -0,0 +1,386 @@ +# Tensor Checks (Host-Side Auto-Validation) + +This page explains the host-side checks that TileLang automatically inserts into the generated host stub for kernels. When you pass `torch.Tensor` or any DLPack-compatible object to a TileLang kernel, the host stub validates argument count, pointer kinds, dtype, shape, strides, device, and more — so you don’t need to handwrite Python checks. This keeps the ABI stable and significantly reduces Python overhead compared to doing equivalent checks in Python or via pybind. + +## Why Host-Side Checks +- ABI stability: the entry is based on TVM FFI + DLPack, consistently accepting tensors and scalars. +- Lower overhead: shifting checks from Python into C reduces interpreter/property-access costs; the call overhead is lower than pybind-based approaches. 
+- Focused error reporting: assertions are raised close to the call site with precise “which field failed” messages.
+
+## How To Inspect Host Source
+You can inspect the auto-generated host source (with all checks and the final device-kernel call) for debugging:
+
+```python
+print(matmul_relu_kernel.get_host_source())
+```
+
+---
+
+## What The Host Checks
+
+### 1) Argument count and pointer kind
+- `num_args` must match the number of formal parameters; otherwise the kernel returns `-1` with an error message.
+- Each argument’s FFI type must be a pointer kind (for DLTensor/handle) or a valid scalar type; otherwise you’ll see errors like `Expect arg[i] to be pointer` or a scalar type error.
+
+### 2) Tensor checks (per tensor, after nullability decision)
+- Nullability
+  - If the tensor is “statically reachable/used” by the function body, the handle must be non-NULL; otherwise: `xxx is expected to have non-NULL pointer`.
+  - If an input tensor is not used by the function (statically unreachable), NULL is allowed; other field checks are executed only when `handle != NULL`.
+- Rank (`ndim`)
+  - Runtime `ndim` must equal the compile-time rank.
+- Data type (`dtype`)
+  - Match the triple `(code, bits, lanes)` with tolerance:
+    - `float8_e4m3`: accept `e4m3`, `e4m3fn`, `e4m3fnuz`.
+    - `float8_e5m2`: accept `e5m2`, `e5m2fnuz`.
+    - `bool`: accept `int8/uint8` with `bits=8` (same lanes), `kDLBool(code=6, bits=1 or 8)`, and any `bitwidth=1` (lanes must match).
+  - For packed-bit dtypes (e.g., `Int(1)`, `Int(4)`, `UInt(4)`), strict dtype checking is skipped.
+- Shape
+  - Each runtime dimension is bound to the compile-time shape (constants or symbols) and checked for consistency.
+  - Linear equations among symbolic dims can be solved on the fly (when there’s only one unknown at a given check point), enabling cross-tensor constraints.
+- Strides
+  - If `buffer_type = AutoBroadcast`: allow `strides == NULL` and derive strides from `shape`. If explicit `strides` is present, bind to compile-time constraints and check for equality.
+  - Otherwise: check per-dimension; if `strides == NULL`, derive from `shape` and compare (e.g., contiguous: `strides[-1] == 1`, `strides[-2] == shape[-1]`).
+- `byte_offset`
+  - Must be 0 (non-zero raises an error) to keep addressing simple and aligned.
+- Device info
+  - Assert `device_type == target backend` (CUDA/ROCM/Metal/OneAPI/WebGPU/CPU, etc.). Error messages include a DLPack code legend.
+  - When multiple tensors participate, assert that `device_id` matches across them.
+- Data pointer
+  - Must be non-NULL when the tensor is required to be non-null by the nullability rule.
+
+### 3) Scalar checks
+- `T.int*` family: require integer; error: `Expect arg[i] to be int`.
+- `T.bool`: require boolean; error: `Expect arg[i] to be boolean`.
+
+---
+
+## Shapes and Symbolic Equations: Linear Solving
+When shapes are symbolic, the host binds and (when possible) solves linear relations at runtime (only one unknown per check point). Example:
+
+```python
+@T.prim_func
+def main(
+    A: T.Tensor((m,), dtype),
+    B: T.Tensor((m + n,), dtype),
+    C: T.Tensor((n * k,), dtype),
+):
+    ...
+```
+
+This enables enforcing cross-tensor relationships like `len(B) == m + n` and `len(C) == n * k` at runtime.
+
+---
+
+## Nullability Rules and Examples
+Which tensors may be NULL?
+
+- Rule: If an input tensor is not used by the function under static analysis (i.e., the access is statically unreachable), it is considered Nullable; otherwise it must be non-NULL.
+- Examples:
+
+1) Must be non-NULL (used)
+```python
+@T.prim_func
+def main(A: T.Tensor((M, K), dtype)):
+    A[0] = 1
+```
+Passing `None` raises: `main.A_handle is expected to have non-NULL pointer`.
+
+2) Still must be non-NULL (constant-true branch)
+```python
+some_cond: bool = True
+@T.prim_func
+def main(A: T.Tensor((M, K), dtype)):
+    if some_cond:
+        A[0] = 1
+```
+
+3) Nullable (constant-false branch, statically unreachable)
+```python
+some_cond: bool = False
+@T.prim_func
+def main(A: T.Tensor((M, K), dtype)):
+    if some_cond:
+        A[0] = 1
+```
+
+4) Must be non-NULL (runtime condition)
+```python
+@T.prim_func
+def main(A: T.Tensor((M, K), dtype), some_cond: T.bool):
+    if some_cond:
+        A[0] = 1
+```
+Since `some_cond` is only known at runtime, static analysis cannot prove `A` is unused; `A` is thus non-nullable.
+
+---
+
+## Device Type Codes (DLPack)
+Supported and referenced device codes in error messages: `1=CPU, 2=CUDA, 7=Vulkan, 8=Metal, 10=ROCM, 14=OneAPI, 15=WebGPU`.
+Kernels assert that `device_type` matches the target backend, and require `device_id` consistency across tensors.
+
+---
+
+## Common Error Examples (What you’ll see)
+- Argument count mismatch (num_args)
+  - Trigger: missing/extra argument
+  - Error: `: num_args should be N; expected: , got: N`
+
+- Pointer-typed argument expected
+  - Trigger: scalar passed where a tensor is expected
+  - Error: `: Expect arg[i] to be pointer`
+
+- Rank (ndim) mismatch
+  - Trigger: runtime rank differs from compile-time rank
+  - Error: `..ndim is expected to equal R, but got mismatched ndim`
+
+- Dtype mismatch
+  - Trigger: dtype not equal to the compiled dtype and not within the tolerance set
+  - Error: `..dtype is expected to be , but got incompatible dtype`
+
+- Shape constraint violation
+  - Trigger: a dimension doesn’t match a constant/symbol binding
+  - Error: `Argument ..shape[i] has an unsatisfied constraint: ... == `
+
+- Strides check failed (e.g., non-contiguous layout)
+  - Trigger: transposed/sliced tensors that violate expected strides
+  - Error: `Argument ..strides[j] has an unsatisfied constraint: ... == `
+
+- Device type mismatch
+  - Trigger: calling a CUDA kernel with CPU tensors, etc.
+  - Error: `..device_type mismatch [expected: ()] ...`
+
+- Device id mismatch
+  - Trigger: mixing tensors from different GPUs
+  - Error: `Argument ..device_id has an unsatisfied constraint: ... == ...`
+
+- NULL data pointer
+  - Trigger: tensor required to be non-null has a NULL data pointer
+  - Error: `. is expected to have non-NULL data pointer, but got NULL`
+
+- Scalar type mismatch
+  - Trigger: passing float to `T.int32`, or non-boolean to `T.bool`
+  - Error: `: Expect arg[i] to be int/boolean`
+
+---
+
+## Troubleshooting Tips
+- Print the host source: `print(fn.get_host_source())` to see the exact assertion and expected vs. actual fields.
+- Fix strides: call `.contiguous()` for non-contiguous tensors, or avoid generating transposed/sliced layouts that break assumptions.
+- Align devices: ensure all participating tensors share the same `device_type` and `device_id`.
+- Align dtype: use `.to()` or construct tensors with the correct dtype; pay attention to `float8` and `bool` tolerance.
+- Dynamic shapes: ensure cross-tensor linear relations can be uniquely determined at the check point (only one unknown at a time).
+
+---
+
+## FAQ
+- Can I disable the checks?
+  - Not recommended and usually not supported. Checks are done on the host to preserve ABI stability and fail early close to the device call.
+- Is the overhead noticeable?
+  - The checks are lightweight (branches and field reads). Compared to Python-side checks, it’s faster; the dominating cost remains the Python→C boundary. Overall it’s cheaper than equivalent checks in Python.
+
+---
+
+## Reference Example (Matmul + ReLU)
+
+```python
+@T.prim_func
+def matmul_relu_kernel(
+    A: T.Tensor((M, K), dtype),
+    B: T.Tensor((K, N), dtype),
+    C: T.Tensor((M, N), dtype),
+):
+    # Initialize Kernel Context
+    with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
+        A_shared = T.alloc_shared((block_M, block_K), dtype)
+        B_shared = T.alloc_shared((block_K, block_N), dtype)
+        C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
+        T.clear(C_local)
+        for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=0):
+            T.copy(A[by * block_M, ko * block_K], A_shared)
+            T.copy(B[ko * block_K, bx * block_N], B_shared)
+            T.gemm(A_shared, B_shared, C_local)
+        T.copy(C_local, C[by * block_M, bx * block_N])
+
+# For debugging, print the host source
+print(matmul_relu_kernel.get_host_source())
+```
+
+The host will insert all checks described above for this example.
+
+---
+
+## Quick Error Reference (Short List)
+- Argument count
+  - Trigger: missing/extra args; Error: `num_args should be N; expected: , got: N`.
+- Pointer kind
+  - Trigger: scalar passed to tensor arg; Error: `Expect arg[i] to be pointer`.
+- Rank (ndim)
+  - Trigger: runtime rank != compile-time; Error: `ndim ... expected to equal R`.
+- Dtype
+  - Trigger: mismatch and not tolerated; Error: `dtype ... expected to be `.
+- Shape
+  - Trigger: constant/symbol binding violated; Error: `shape[i] ... == `.
+- Strides
+  - Trigger: layout mismatch; Error: `strides[j] ... == `.
+- Device type
+  - Trigger: wrong backend device; Error: `device_type mismatch [expected: ...]`.
+- Device id
+  - Trigger: tensors on different GPUs; Error: `device_id ... == ...`.
+- Data pointer
+  - Trigger: required non-NULL but NULL; Error: `non-NULL data pointer`.
+- Scalar types
+  - Trigger: wrong scalar type; Error: `Expect arg[i] to be int/boolean`.
+
+---
+
+## Host Error Troubleshooting (Minimal Repros)
+
+Below are minimal repro snippets for common host-side errors, assuming a CUDA-targeted kernel like `matmul_relu_kernel` with:
+
+```python
+# Convention:
+# A: float16 [M, K]
+# B: float16 [K, N]
+# C: float16 [M, N]
+# Target: CUDA (device_type=2)
+fn = matmul_relu_kernel # your compiled function
+M = N = K = 1024
+```
+
+Adjust dtype/device if your kernel differs.
+
+### 0. Tip: print the host source
+```python
+print(fn.get_host_source())
+```
+
+### 1. num_args mismatch
+```python
+import torch
+
+A = torch.empty((M, K), device='cuda', dtype=torch.float16)
+B = torch.empty((K, N), device='cuda', dtype=torch.float16)
+# Missing C
+fn(A, B)
+```
+Expected: `: num_args should be 3; expected: , got: 3`.
+
+Fix: pass all arguments per the signature.
+
+### 2. Expect pointer (tensor) but got scalar
+```python
+import torch
+
+B = torch.empty((K, N), device='cuda', dtype=torch.float16)
+C = torch.empty((M, N), device='cuda', dtype=torch.float16)
+fn(1, B, C)
+```
+Expected: `: Expect arg[0] to be pointer`.
+
+Fix: pass a DLPack-compatible tensor (e.g., torch.Tensor).
+
+### 3. 
ndim mismatch +```python +import torch + +A = torch.empty((M, K, 1), device='cuda', dtype=torch.float16) # rank=3 +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `.A_handle.ndim is expected to equal 2, but got mismatched ndim`. + +Fix: ensure runtime rank equals compiled rank. + +### 4. dtype mismatch +```python +import torch + +A = torch.empty((M, K), device='cuda', dtype=torch.float32) # should be float16 +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `.A_handle.dtype is expected to be float16, but got incompatible dtype`. + +Fix: `A = A.to(torch.float16)` or create with the correct dtype. + +### 5. Shape constant/symbol mismatch +```python +import torch + +A = torch.empty((M, K + 1), device='cuda', dtype=torch.float16) # K mismatched +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `Argument .A_handle.shape[i] has an unsatisfied constraint: ... == `. + +Fix: satisfy linear constraints and constants across tensors. + +### 6. Strides check failure (non-contiguous) +```python +import torch + +A = torch.empty((M, K), device='cuda', dtype=torch.float16) +A_nc = A.t() # transpose -> non-contiguous +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A_nc, B, C) +``` +Expected: `Argument .A_handle.strides[1] has an unsatisfied constraint: ... == 1`. + +Fix: pass `A_nc.contiguous()` or align the layout expectation in the kernel. + +### 7. device_type mismatch +```python +import torch + +A = torch.empty((M, K), device='cpu', dtype=torch.float16) +B = torch.empty((K, N), device='cpu', dtype=torch.float16) +C = torch.empty((M, N), device='cpu', dtype=torch.float16) +fn(A, B, C) # CUDA-targeted kernel +``` +Expected: `.A_handle.device_type mismatch [expected: 2 (cuda)] ...`. + +Fix: move tensors to the CUDA device. + +### 8. device_id mismatch (multi-GPU) +```python +import torch + +A = torch.empty((M, K), device='cuda:0', dtype=torch.float16) +B = torch.empty((K, N), device='cuda:1', dtype=torch.float16) +C = torch.empty((M, N), device='cuda:0', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `Argument .B_handle.device_id has an unsatisfied constraint: ... == ...`. + +Fix: place all tensors on the same GPU (e.g., `cuda:0`). + +### 9. NULL data pointer (advanced) +This usually comes from hand-constructed DLTensor/NDArray, or external frameworks passing unallocated/freed storage. Regular `torch.Tensor` allocations rarely hit this. + +Expected: `. is expected to have non-NULL data pointer, but got NULL`. + +Fix: ensure valid underlying storage; in PyTorch scenarios, avoid constructing tensors from invalid external handles. + +### 10. Scalar type mismatch (int / bool) +```python +import tilelang.language as T + +@T.prim_func +def scalar_check(x: T.int32, flag: T.bool()): + T.evaluate(0) + +scalar_check(1.0, True) # x is float -> Expect arg[0] to be int +scalar_check(1, 2.5) # flag is float -> Expect arg[1] to be boolean +``` + +Fix: pass correct scalar types, e.g., `scalar_check(1, True)`. + +--- + +## Closing Notes +- Cross-check “shape / strides / device / dtype” against the kernel signature to localize issues efficiently. 
+- For complex symbolic relations, print the host source to confirm binding/solving order, then adjust runtime shapes/layouts accordingly. diff --git a/docs/conf.py b/docs/conf.py index 1b1289038..877b5582e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,5 +1,5 @@ # General information about the project. -project = "Tile Language
" +project = "TileLang
" author = "Tile Lang Contributors" copyright = f"2025-2025, {author}" @@ -20,33 +20,27 @@ "autoapi.extension", ] -autoapi_type = 'python' -autoapi_dirs = ['../tilelang'] +autoapi_type = "python" +autoapi_dirs = ["../tilelang"] autoapi_options = [ - 'members', - 'undoc-members', - 'show-inheritance', - 'show-module-summary', - 'special-members', + "members", + "undoc-members", + "show-inheritance", + "show-module-summary", + "special-members", ] autoapi_keep_files = False # Useful for debugging the generated rst files autoapi_generate_api_docs = True -autodoc_typehints = 'description' +autodoc_typehints = "description" autoapi_ignore = ["*language/ast*", "*version*", "*libinfo*", "*parser*"] -source_suffix = { - '.rst': 'restructuredtext', - '.md': 'markdown', -} +source_suffix = {".rst": "restructuredtext", ".md": "markdown"} -myst_enable_extensions = [ - "colon_fence", - "deflist", -] +myst_enable_extensions = ["colon_fence", "deflist"] redirects = {"get_started/try_out": "../index.html#getting-started"} @@ -62,13 +56,11 @@ html_theme = "furo" templates_path = [] html_static_path = ["_static"] -footer_copyright = "© 2025-2025 Tile Language" +html_css_files = ["custom.css"] +footer_copyright = "© 2025-2026 TileLang" footer_note = " " -html_theme_options = { - "light_logo": "img/logo-row.svg", - "dark_logo": "img/logo-row.svg", -} +html_theme_options = {"light_logo": "img/logo-v2.png", "dark_logo": "img/logo-v2.png"} header_links = [ ("Home", "https://github.com/tile-ai/tilelang"), diff --git a/docs/deeplearning_operators/deepseek_mla.md b/docs/deeplearning_operators/deepseek_mla.md index 08175778f..ed02b58b1 100644 --- a/docs/deeplearning_operators/deepseek_mla.md +++ b/docs/deeplearning_operators/deepseek_mla.md @@ -1,8 +1,7 @@ # 🚀 Write High Performance FlashMLA with TileLang on Hopper -
- Author: Yu Cheng + Author: Yu Cheng Author: Lei Wang
@@ -32,14 +31,14 @@ Figure 1: Performance under batch size=64 Figure 2: Performance under batch size=128 ``` -As shown in the results, TileLang achieves performance comparable to FlashMLA in most cases, significantly outperforming both FlashInfer and Triton. +As shown in the results, TileLang achieves performance comparable to FlashMLA in most cases, significantly outperforming both FlashInfer and Triton. Notably, **TileLang accomplishes this with just around 80 lines of Python code**, demonstrating its exceptional ease of use and efficiency. Let's dive in and see how TileLang achieves this. ## Implementation First, let's review the core computation logic of traditional FlashAttention: -```python +```python # acc_s: [block_M, block_N] # scores_max: [block_M] # scores_scale: [block_M] @@ -62,7 +61,7 @@ Compared to traditional attention operators like MHA (Multi-Headed Attention) or This raises the question of how to partition the matrix multiplication operation. On the Hopper architecture, most computation kernels use [`wgmma.mma_async`](https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions) instructions for optimal performance. The `wgmma.mma_async` instruction organizes 4 warps (128 threads) into a warpgroup for collective MMA operations. However, `wgmma.mma_async` instructions require a minimum M dimension of 64. This means each warpgroup's minimum M dimension can only be reduced to 64, but a tile size of 64*512 is too large for a single warpgroup, leading to register spilling. -Therefore, our only option is to partition `acc_o` along the `dim` dimension, with two warpgroups computing the left and right part of `acc_o` respectively. However, this introduces another challenge: both warpgroups require the complete `acc_s` result as input. +Therefore, our only option is to partition `acc_o` along the `dim` dimension, with two warpgroups computing the left and right part of `acc_o` respectively. However, this introduces another challenge: both warpgroups require the complete `acc_s` result as input. Our solution is to have each warpgroup compute half of `acc_s` during `Q @ K` computation, then obtain the other half computed by the other warpgroup through shared memory. @@ -106,7 +105,6 @@ T.use_swizzle(panel_size: int, order: str = "row") Here, `panel_size` specifies the width of the swizzled threadblock group, and `order` determines the swizzling pattern, which can be either "row" or "col". - ### Shared Memory Swizzling In CUDA programming, shared memory is divided into multiple memory banks, with each bank capable of servicing one thread request per clock cycle in parallel. Bank conflicts occur when multiple threads simultaneously access different addresses mapped to the same bank, forcing these accesses to be serialized and degrading performance. @@ -123,17 +121,14 @@ T.annotate_layout({ Here, `T.annotate_layout` allows users to specify any desired layout for a buffer. For convenience, TileLang provides the `make_swizzled_layout` primitive to automatically generate a swizzled layout. - ### Warp-Specialization The Hopper architecture commonly employs warp specialization for performance optimization. A typical approach is to designate one warpgroup as a producer that handles data movement using TMA (Tensor Memory Accelerator), while the remaining warpgroups serve as consumers performing computations. 
However, this programming pattern is complex, requiring developers to manually manage the execution logic for producers and consumers, including synchronization through the `mbarrier` objects. In TileLang, users are completely shielded from these implementation details. The frontend script is automatically transformed into a warp-specialized form, where TileLang handles all producer-consumer synchronization automatically, enabling efficient computation. - ### Pipeline - Pipeline is a technique used to improve memory access efficiency by overlapping memory access and computation. In TileLang, pipeline can be implemented through the `T.pipelined` annotation: ```python @@ -142,14 +137,12 @@ T.pipelined(range: int, stage: int) Here, `range` specifies the range of the pipeline, and `stage` specifies the stage of the pipeline. Multi-stage pipelining enables overlapping of computation and memory access, which can significantly improve performance for memory-intensive operators. However, setting a higher number of stages consumes more shared memory resources, so the optimal configuration needs to be determined based on specific use cases. - ### Split-KV We have also implemented Split-KV optimization similar to [FlashDecoding](https://pytorch.org/blog/flash-decoding/). Specifically, when the batch size is small, parallel SM resources cannot be fully utilized due to low parallelism. In such cases, we can split the kv_ctx dimension across multiple SMs for parallel computation and then merge the results. In our implementation, we have developed both split and combine kernels, allowing users to control the split size through a `num_split` parameter. - ## 🚀 On AMD MI300X Accelerators Following our previous demonstration of [high-performance FlashMLA implementation on NVIDIA Hopper architectures using TileLang](https://github.com/tile-ai/tilelang/blob/main/examples/deepseek_mla/README.md), this work presents an optimized implementation for AMD MI300X accelerators. We examine architectural differences and corresponding optimization strategies between these platforms. @@ -167,7 +160,7 @@ Key implementation differences between Hopper and MI300X architectures include: # Original shared memory allocation Q_shared = T.alloc_shared([block_H, dim], dtype) Q_pe_shared = T.alloc_shared([block_H, pe_dim], dtype) - + # Optimized register allocation Q_local = T.alloc_fragment([block_H, dim], dtype) Q_pe_local = T.alloc_fragment([block_H, pe_dim], dtype) diff --git a/docs/deeplearning_operators/elementwise.md b/docs/deeplearning_operators/elementwise.md index 5e1243c26..6aa8e4085 100644 --- a/docs/deeplearning_operators/elementwise.md +++ b/docs/deeplearning_operators/elementwise.md @@ -8,7 +8,7 @@ :class: myclass1 myclass2 :name: a-tip-reference - This document is still **experimental** and may be incomplete. + This document is still **experimental** and may be incomplete. Suggestions and improvements are highly encouraged—please submit a PR! 
::: @@ -24,7 +24,7 @@ Please note that this tutorial does not delve deeply into the design principles ## Elementwise add in TileLang ```python -def elementwise_add(N, threads=256, dtype="bfloat16"): +def elementwise_add(N, threads=256, dtype=T.bfloat16): @T.prim_func def main(A: T.Tensor((N), dtype), B: T.Tensor((N), dtype), C: T.Tensor((N), dtype)): @@ -43,7 +43,7 @@ Those familiar with CUDA programming might wonder where `threadIdx` fits into th The program can be compiled using the following code: ```python -program = elementwise_add(1024, threads=256, dtype="bfloat16") +program = elementwise_add(1024, threads=256, dtype=T.bfloat16) kernel = tilelang.compile(program, out_idx=-1, target="cuda", execution_backend="cython") ``` Launching the kernel is straightforward, just call it directly like a function: @@ -89,7 +89,7 @@ def elementwise_add( In the compilation process above, a fixed shape was used. However, in practical usage, we often want the kernel to support dynamic shapes. So, how can we compile a kernel in TileLang to handle dynamic shapes? In TileLang, we can replace the target size with a dynamic symbolic value, making the dimension dynamic. The following example illustrates this: ```python -program = elementwise_add(T.dynamic("N"), threads=256, dtype="bfloat16") +program = elementwise_add(T.dynamic("N"), threads=256, dtype=T.bfloat16) kernel = tilelang.compile(program, out_idx=-1, target="cuda", execution_backend="cython") ``` @@ -102,7 +102,7 @@ TileLang automatically incorporates boundary-checking conditions; however, this When compiling the example below, let's set `N` to 2047: ```python -def elementwise_add(N, num_per_thread=8, threads=256, dtype="bfloat16"): +def elementwise_add(N, num_per_thread=8, threads=256, dtype=T.bfloat16): @T.prim_func def main(A: T.Tensor((N), dtype), B: T.Tensor((N), dtype), C: T.Tensor((N), dtype)): @@ -176,7 +176,7 @@ While TileLang incorporates various optimizations for the aforementioned case, i In such scenarios, explicitly specifying the number of elements computed per thread can help "guide" TileLang's code generation process, leading to implementations that are more closely aligned with the intended design. ```python -def elementwise_add(N, num_per_thread=8, threads=256, dtype="bfloat16"): +def elementwise_add(N, num_per_thread=8, threads=256, dtype=T.bfloat16): @T.prim_func def main(A: T.Tensor((N), dtype), B: T.Tensor((N), dtype), C: T.Tensor((N), dtype)): @@ -212,7 +212,7 @@ Aha, this CUDA code aligns closely with conventional programming practices, maki But what happens if we provide additional hints to TileLang? For instance, by explicitly specifying register copies using the `T.copy(...)` operation. The example below demonstrates a vector addition implementation. Unlike the previous examples, this code explicitly loads data into registers before performing computations. 
```python -def elementwise_add(N, NUM_ELE_PER_THREAD=8, threads=256, dtype="bfloat16"): +def elementwise_add(N, NUM_ELE_PER_THREAD=8, threads=256, dtype=T.bfloat16): @T.prim_func def main(A: T.Tensor((N), dtype), B: T.Tensor((N), dtype), C: T.Tensor((N), dtype)): @@ -280,8 +280,8 @@ To evaluate complexity, one could implement the same elementwise addition operat ```c++ template -__global__ void elementwise_add(nv_bfloat16* C, - const nv_bfloat16* A, +__global__ void elementwise_add(nv_bfloat16* C, + const nv_bfloat16* A, const nv_bfloat16* B, int N) { using namespace cute; diff --git a/docs/deeplearning_operators/gemv.md b/docs/deeplearning_operators/gemv.md index c75a961b8..c2dddf47f 100644 --- a/docs/deeplearning_operators/gemv.md +++ b/docs/deeplearning_operators/gemv.md @@ -6,7 +6,7 @@
:::{warning} - This document is still **experimental** and may be incomplete. + This document is still **experimental** and may be incomplete. Suggestions and improvements are highly encouraged—please submit a PR! ::: @@ -206,7 +206,6 @@ def splitk_gemv( return main ``` - ## Vectorized Reads GEMV is less computation intensive than GEMM as the computation intensity and memory throughput will be the optimization bottleneck. One effective strategy is to use vectorized load/store operations (e.g., `float2`, `float4`). In `TileLang`, you can specify vectorized operations via `T.vectorized`: @@ -254,7 +253,6 @@ def splitk_gemv_vectorized( With vectorized read, now the kernel finishes in **~0.0084 ms**, which is getting close to cuBLAS performance. - ## `tvm_thread_allreduce` Instead of `atomicAdd` [`tvm_thread_allreduce`](https://tvm.apache.org/docs/reference/api/python/tir/tir.html#tvm.tir.tvm_thread_allreduce) has implemented optimization when making an all-reduce across a number of threads, which should outperfrom out plain smem + `atomidAdd`: @@ -294,7 +292,7 @@ def splitk_gemv_vectorized_tvm( C_accum[0] += A_local[k].astype(accum_dtype) * B_local[k].astype(accum_dtype) C_reduced = T.alloc_local((1,), accum_dtype) with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), + T.comm_reducer(lambda x, y: x + y, [T.cast(0, accum_dtype)]), "reduce_scope", T.reinterpret(T.uint64(0), dtype="handle"), ): @@ -379,7 +377,7 @@ def get_best_config(N, K): C_accum[0] += A_local[k].astype(accum_dtype) * B_local[k].astype(accum_dtype) C_reduced = T.alloc_local((1,), accum_dtype) with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), + T.comm_reducer(lambda x, y: x + y, [T.cast(0, accum_dtype)]), "reduce_scope", T.reinterpret(T.uint64(0), dtype="handle"), ): @@ -459,6 +457,5 @@ This corresponds closely to our `TileLang` program, with necessary synchronizati | splitk_gemv_vectorized | 0.00809 ms | | splitk_gemv_vectorized_tvm | 0.00675 ms | - Triton Time: 0.0077344514429569244 -In this tutorial, we implemented a simple GEMV kernel and learn that `TileLang` exposes low level control to user such as thread-level programming and CUDA primitives. \ No newline at end of file +In this tutorial, we implemented a simple GEMV kernel and learn that `TileLang` exposes low level control to user such as thread-level programming and CUDA primitives. diff --git a/docs/deeplearning_operators/matmul.md b/docs/deeplearning_operators/matmul.md index fea036ebe..12189eb8f 100644 --- a/docs/deeplearning_operators/matmul.md +++ b/docs/deeplearning_operators/matmul.md @@ -14,11 +14,11 @@ TileLang is a domain-specific language (DSL) designed for writing high-performance GPU kernels. It provides three main levels of abstraction: -* **Level 1:** A user writes pure compute logic without knowledge of or concern for hardware details (e.g., GPU caches, tiling, etc.). The compiler or runtime performs automatic scheduling and optimization. This level is conceptually similar to the idea behind TVM. +- **Level 1:** A user writes pure compute logic without knowledge of or concern for hardware details (e.g., GPU caches, tiling, etc.). The compiler or runtime performs automatic scheduling and optimization. This level is conceptually similar to the idea behind TVM. -* **Level 2:** A user is aware of GPU architecture concepts—such as shared memory, tiling, and thread blocks—but does not necessarily want to drop down to the lowest level of explicit thread control. 
This mode is somewhat comparable to Triton's programming model, where you can write tile-level operations and let the compiler do layout inference, pipelining, etc. +- **Level 2:** A user is aware of GPU architecture concepts—such as shared memory, tiling, and thread blocks—but does not necessarily want to drop down to the lowest level of explicit thread control. This mode is somewhat comparable to Triton's programming model, where you can write tile-level operations and let the compiler do layout inference, pipelining, etc. -* **Level 3:** A user takes full control of thread-level primitives and can write code that is almost as explicit as a hand-written CUDA/HIP kernel. This is useful for performance experts who need to manage every detail, such as PTX inline assembly, explicit thread behavior, etc. +- **Level 3:** A user takes full control of thread-level primitives and can write code that is almost as explicit as a hand-written CUDA/HIP kernel. This is useful for performance experts who need to manage every detail, such as PTX inline assembly, explicit thread behavior, etc. ```{figure} ../_static/img/overview.png :width: 50% @@ -52,12 +52,12 @@ While Level 1 in TileLang can be very comfortable for general users—since it r Below is a simplified code snippet for a 1024 x 1024 x 1024 matrix multiplication. It uses: -* **`T.Kernel(...)`** to initialize the thread block configuration (grid dimensions, block size, etc.). -* **`T.alloc_shared(...)`** to allocate GPU shared memory. -* **`T.alloc_fragment(...)`** to allocate a register fragment for accumulation. -* **`T.Pipelined(...)`** to express software pipelining across the K dimension. -* **`T.Parallel(...)`** to parallelize data copy loops. -* **`T.gemm(...)`** to perform tile-level GEMM operations (which map to the appropriate backends, such as MMA instructions on NVIDIA GPUs). +- **`T.Kernel(...)`** to initialize the thread block configuration (grid dimensions, block size, etc.). +- **`T.alloc_shared(...)`** to allocate GPU shared memory. +- **`T.alloc_fragment(...)`** to allocate a register fragment for accumulation. +- **`T.Pipelined(...)`** to express software pipelining across the K dimension. +- **`T.Parallel(...)`** to parallelize data copy loops. +- **`T.gemm(...)`** to perform tile-level GEMM operations (which map to the appropriate backends, such as MMA instructions on NVIDIA GPUs). ```python import tilelang @@ -147,14 +147,12 @@ with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, - This sets up the block grid dimensions based on N/block_N and M/block_M. - `threads=128` specifies that each thread block uses 128 threads. The compiler will infer how loops map to these threads. - ```{figure} ../_static/img/Parallel.png :alt: Parallel :align: center ``` - 2. **Shared & Fragment Memory**: ```python @@ -182,7 +180,6 @@ for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3): ``` - 4. 
**Parallel Copy**: ```python @@ -252,8 +249,8 @@ For more advanced usage—including partial lowering, explicitly controlling thr ## Further Resources -* [TileLang GitHub](https://github.com/tile-ai/tilelang) -* [BitBLAS](https://github.com/tile-ai/bitblas) -* [Triton](https://github.com/openai/triton) -* [Cutlass](https://github.com/NVIDIA/cutlass) -* [PyCUDA](https://documen.tician.de/pycuda/) +- [TileLang GitHub](https://github.com/tile-ai/tilelang) +- [BitBLAS](https://github.com/tile-ai/bitblas) +- [Triton](https://github.com/openai/triton) +- [Cutlass](https://github.com/NVIDIA/cutlass) +- [PyCUDA](https://documen.tician.de/pycuda/) diff --git a/docs/deeplearning_operators/matmul_sparse.md b/docs/deeplearning_operators/matmul_sparse.md new file mode 100644 index 000000000..8caa6182f --- /dev/null +++ b/docs/deeplearning_operators/matmul_sparse.md @@ -0,0 +1,261 @@ +# Sparse Matrix-Matrix Multiplication with Tile Library + +
+ Author: botbw +
+
+:::{warning}
+    This document is still **experimental** and may be incomplete.
+
+    This feature is still **experimental** and needs further optimization.
+
+    Suggestions and improvements are highly encouraged—please submit a PR!
+:::
+
+:::{tip}
+It's suggested to go through `docs/deeplearning_operators/matmul.md` first.
+
+Example code can be found at `examples/gemm_sp`.
+:::
+
+## Structured sparsity in the NVIDIA Ampere architecture
+
+Since the Ampere architecture (sm80 and above), sparsity support has been integrated into Tensor Cores. This allows a 2:4 (or 1:2 for 32-bit data types) semi-structured matrix to be compressed into its non-zero values along with associated metadata, which can then be fed into the Tensor Core. This enables up to **2x throughput** compared to the equivalent dense computation.
+
+:::{warning}
+    This tutorial primarily focuses on CUDA, as this feature is not yet supported on ROCm. However, AMD provides a similar capability in the matrix cores of GPUs such as the MI300X.
+:::
+
+```{figure} ../_static/img/sparse_mma_storage_example.png
+:align: center
+
+Figure: Sparse MMA storage example (from PTX doc)
+```
+
+## Compress a dense tensor
+
+To utilize sparse Tensor Cores, a dense tensor must first be **compressed** into its non-zero values along with the corresponding metadata.
+
+Both `PyTorch` and `vLLM` use `CUTLASS` as their computation backend (see references [here](https://github.com/pytorch/pytorch/blob/a8d6afb511a69687bbb2b7e88a3cf67917e1697e/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredOps.cu#L47) and [here](https://github.com/vllm-project/vllm/blob/a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh#L116)), leveraging `CUTLASS`’s built-in compressor (or reimplementing it in `PyTorch`).
+
+A set of **CUTLASS-compatible** compressors is provided in `tilelang.utils.sparse`, where a dense tensor—along with other required arguments (e.g., block_K for sm90, transpose options)—can be passed in to perform the compression.
+
+```python
+from tilelang.utils.sparse import compress
+A_sparse, E = compress(A, transposed=trans_A, block_k=block_K)
+```
+
+Here, `A_sparse` contains all the non-zero elements of `A`, while `E` stores the corresponding metadata (indexing information) required to reconstruct the original sparse pattern.
+
+> NOTE: When using the CUTLASS compressor, there is no direct positional correspondence between `A_sparse`/`A` and `E` (i.e., the 4-element group at [n, k] does not correspond to the 4-bit metadata at [n, k] if the metadata is viewed as an int4 tensor).
+The metadata is reordered internally to optimize memory access patterns (e.g., for ldsm instructions and vectorized loads).
+For more information, see **A note on `gemm_sp` and `gemm_sp_v2`**.
+
+## `T.gemm_sp` with CUTLASS's compressor
+
+:::{warning}
+
+It is strongly recommended to use `T.gemm_sp_v2` due to its greater flexibility and faster compilation time.
+
+:::
+
+A 2:4 sparse GEMM kernel is similar to its dense counterpart, except that it also requires handling the associated metadata.
+
+See the comments in the kernel code below for the required modifications.
+ +```python +def matmul_sp_sm80( + M, + N, + K, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, + trans_A, + trans_B, +): + is_8_bit = "8" in in_dtype + metadata_dtype = 'int32' if is_8_bit else 'int16' + E_factor = SparseTensorCoreIntrinEmitter.E_FACTOR_MAP[in_dtype][metadata_dtype] # Calculate shape for given datatypes + A_sparse_shape = (M, K // 2) if not trans_A else (K // 2, M) + B_shape = (K, N) if not trans_B else (N, K) + A_shared_shape = (block_M, block_K // 2) if not trans_A else (block_K // 2, block_M) + B_shared_shape = (block_K, block_N) if not trans_B else (block_N, block_K) + + import tilelang.language as T + + @T.prim_func + def main( + A_sparse: T.Tensor(A_sparse_shape, in_dtype), + E: T.Tensor((M, K // E_factor), metadata_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + E_shared = T.alloc_shared((block_M, block_K // E_factor), metadata_dtype) # Allocate smem for metadata + C_frag = T.alloc_fragment((block_M, block_N), accum_dtype) + T.annotate_layout({ # Annotate reordered cutlass metadata layout + E: + make_cutlass_metadata_layout(E, mma_dtype=in_dtype, arch="8.0"), + E_shared: + make_cutlass_metadata_layout( + E_shared, mma_dtype=in_dtype, arch="8.0"), + }) + T.clear(C_frag) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(E[by * block_M, k * block_K // E_factor], E_shared) + if trans_A: + T.copy(A_sparse[k * block_K // 2, by * block_M], A_shared) + else: + T.copy(A_sparse[by * block_M, k * block_K // 2], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm_sp(A_shared, E_shared, B_shared, C_frag, trans_A, trans_B) # Call gemm_sp with non-zero values and metadata + T.copy(C_frag, C[by * block_M, bx * block_N]) + + return main +``` + +Under the hood, `gemm_sp` invokes templates adapted from `CUTLASS`, and a compatible metadata layout must be specified using `T.annotate_layout`. + +## `T.gemm_sp_v2` with a custom compressor + +To migrate to `gemm_sp_v2`, simply replace occurrences of `gemm_sp`. + +Unlike `gemm_sp`, `gemm_sp_v2` can operate without `T.annotate_layout`, and it also supports user-defined layouts and compressors. + +The metadata is stored in a `(u)int8`/`(u)int16`/`(u)int32` tensor, where **each 4-bit chunk represents two 2-bit indices** of non-zero elements within four consecutive elements. Here, we start with an `int16` example, which is the **default dtype** for `bf16` and `fp16` on Ampere GPUs. 
+
+Suppose we have the following row vector:
+```python
+t = tensor([[0, 7, 0, 3], [1, 5, 0, 0], [0, 0, 2, 4], [9, 0, 9, 0]], dtype=torch.float16).flatten()
+```
+
+The non-zero elements and their corresponding indices are:
+
+```python
+t_sp = tensor([[7, 3], [1, 5], [2, 4], [9, 9]], dtype=torch.float16).flatten()
+indices = tensor([[1, 3], [0, 1], [2, 3], [0, 2]], dtype=torch.float16).flatten()
+```
+
+The corresponding int16 metadata is:
+```python
+# metadata_bits = tensor([0b1101, 0b0100, 0b1110, 0b1000])
+# Note: storage uses little-endian order: tensor(0b1000111001001101, dtype=torch.int16)
+# Note: the line above is not runnable Python, since the interpreter does not
+# treat the binary literal as a two's-complement int16
+metadata_int16 = tensor(-29107)
+```
+
+You can decode an int16 metadata tensor using the following utility:
+```python
+def decode_metadata(meta: torch.Tensor) -> torch.Tensor:
+    assert meta.dtype is torch.int16
+    groups_per_meta = 16 // 4
+    out = []
+    for g in range(groups_per_meta):
+        group_bits = (meta >> (g * 4)) & 0xF
+        idx0 = group_bits & 0x3
+        idx1 = (group_bits >> 2) & 0x3
+        out.append(torch.stack([idx0, idx1], dim=-1))
+    return torch.concat(out, dim=-1).view(meta.shape[0], -1)
+```
+
+The compressor can be implemented at either the `PyTorch`/`NumPy` level or the kernel level.
+
+For example, `PyTorch` provides an Ampere compressor [here](https://github.com/pytorch/pytorch/blob/267d0197bfca0232488d51dd1ff735d619adc2cf/torch/sparse/_semi_structured_conversions.py#L47-L179). Note that in this implementation, a [permutation](https://github.com/pytorch/pytorch/blob/267d0197bfca0232488d51dd1ff735d619adc2cf/torch/sparse/_semi_structured_conversions.py#L173-L175) is applied to match CUTLASS’s metadata layout. If you do not annotate a metadata layout when using `gemm_sp_v2`, your compressor should replicate the same behavior as the PyTorch example—but without using the `_calculate_meta_reordering_scatter_offsets` function.
+
+If you want to use a custom metadata layout in your kernel, one approach is to define the layout in `TileLang` and then apply the same layout to both your compressor kernel and the matmul_sp kernel.
+ +```python + +@tilelang.jit(out_idx=[1, 2], pass_configs={ + tilelang.PassConfigKey.TIR_DISABLE_VECTORIZE: True, +}) +def compress_kernel(M, K, block_M, block_K, dtype, use_cutlass_layout): + e_factor, e_dtype = ARCH_INFO["8.0"] + e_K = K // e_factor + elem, group = 2, 4 + + assert M % block_M == 0, "M must be divisible by block_M" + assert K % block_K == 0, "K must be divisible by block_K" + assert K % e_factor == 0, "K must be divisible by e_factor" + assert block_K % e_factor == 0, "block_K must be divisible by e_factor" + + @T.prim_func + def kernel( + A: T.Tensor((M, K), dtype), + A_sp: T.Tensor((M, K // 2), dtype), + E: T.Tensor((M, e_K), e_dtype), + ): + with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(K, block_K), threads=block_M) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + A_sp_shared = T.alloc_shared((block_M, block_K // 2), dtype) + E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype) + if use_cutlass_layout: # NOTE: Make sure compressor metadata layout + T.annotate_layout({ # is same with your computation kernel + E: + make_cutlass_metadata_layout( + E, mma_dtype="float16", arch="8.0", block_k=block_K), + E_shared: + make_cutlass_metadata_layout( + E_shared, + mma_dtype="float16", + arch="8.0", + block_k=block_K), + }) + T.clear(A_sp_shared) + T.clear(E_shared) + non_zero_cnt = T.alloc_local((1, ), dtype="uint8") + non_zero_elt_log_idx = T.alloc_local((elem, ), dtype="uint8") + T.copy(A[bx * block_M, by * block_K], A_shared) + for tm in T.Parallel(block_M): + for g_i in range(0, block_K // group): + a_k = g_i * group + T.clear(non_zero_cnt) + T.clear(non_zero_elt_log_idx) + for i in range(group): + val = A_shared[tm, a_k + i] + if val != 0.0: + non_zero_elt_log_idx[non_zero_cnt[0]] = i + A_sp_shared[tm, a_k // 2 + non_zero_cnt[0]] = val + non_zero_cnt[0] += 1 + if non_zero_cnt[0] == 1 and non_zero_elt_log_idx[0] == 3: + non_zero_elt_log_idx[0] = 0 + non_zero_elt_log_idx[1] = 3 + A_sp_shared[tm, a_k // 2 + 1] = A_sp_shared[tm, a_k // 2] + A_sp_shared[tm, a_k // 2] = 0.0 + elif non_zero_cnt[0] == 1: + A_sp_shared[tm, a_k // 2 + 1] = 0 + non_zero_elt_log_idx[1] = 3 + for i in T.serial(elem): + val = non_zero_elt_log_idx[i] + E_shared[tm, a_k // e_factor] |= T.shift_left(val, 4 * (g_i % (e_factor // group)) + 2 * i) + T.copy(A_sp_shared, A_sp[bx * block_M, by * block_K // 2]) + T.copy(E_shared, E[bx * block_M, by * block_K // e_factor]) + + return kernel +``` + +## A note on `gemm_sp` and `gemm_sp_v2` + +Initially, `T.gemm_sp` followed the same design as `T.gemm`, lowering to a `CUTLASS` template. This inherently requires metadata to be reordered offline following a predetermined layout. + +However, fixing a specific layout introduces several potential issues: + +1. Painful debugging experience: Debugging a failed kernel becomes difficult due to the reordered indexing, including permutations and swizzling. + +2. Limited flexibility: For example, concatenating two compressed tensors, such as `A_sparse_0` and `A_sparse_1`, into a new `A_sparse` makes sense. However, concatenating their metadata `E_0` and `E_1` may not be valid unless the layout allows it mathematically. + +3. Alignment requirements: `CUTLASS` enforces strict alignment checks, and many hyperparameter configurations can lead to compilation errors. (For reference, sm8x was implemented in `CUTLASS 2`.) + +`T.gemm_sp_v2` was designed to address these limitations, following the approach of `T.gemm_v2`. It lowers directly to PTX, removing the need for a fixed metadata layout. 
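+
+As a minimal illustration of the migration path mentioned earlier (a sketch only, reusing the operand names from the sm80 example above; `T.gemm_sp_v2` is assumed to take the same argument list, as implied by the "simply replace occurrences" guidance):
+
+```python
+# Before: CUTLASS-backed path; E/E_shared require the CUTLASS metadata layout annotation.
+T.gemm_sp(A_shared, E_shared, B_shared, C_frag, trans_A, trans_B)
+
+# After: PTX-backed path; the T.annotate_layout for E/E_shared becomes optional.
+T.gemm_sp_v2(A_shared, E_shared, B_shared, C_frag, trans_A, trans_B)
+```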
diff --git a/docs/get_started/Installation.md b/docs/get_started/Installation.md index 3d5c6db9d..b23026d9b 100644 --- a/docs/get_started/Installation.md +++ b/docs/get_started/Installation.md @@ -8,25 +8,25 @@ - **Python Version**: >= 3.8 - **CUDA Version**: 12.0 <= CUDA < 13 -The easiest way to install **tile-lang** is directly from PyPI using pip. To install the latest version, run the following command in your terminal: +The easiest way to install tilelang is directly from PyPI using pip. To install the latest version, run the following command in your terminal: ```bash pip install tilelang ``` -Alternatively, you may choose to install **tile-lang** using prebuilt packages available on the Release Page: +Alternatively, you may choose to install tilelang using prebuilt packages available on the Release Page: ```bash pip install tilelang-0.0.0.dev0+ubuntu.20.4.cu120-py3-none-any.whl ``` -To install the latest version of **tile-lang** from the GitHub repository, you can run the following command: +To install the latest version of tilelang from the GitHub repository, you can run the following command: ```bash pip install git+https://github.com/tile-ai/tilelang.git ``` -After installing **tile-lang**, you can verify the installation by running: +After installing tilelang, you can verify the installation by running: ```bash python -c "import tilelang; print(tilelang.__version__)" @@ -40,18 +40,18 @@ python -c "import tilelang; print(tilelang.__version__)" - **Python Version**: >= 3.8 - **CUDA Version**: >= 10.0 -```bash -docker run -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.01-py3 -``` +If you prefer Docker, please skip to the [Install Using Docker](#install-using-docker) section. This section focuses on building from source on a native Linux environment. -To build and install **tile-lang** directly from source, follow these steps. This process requires certain pre-requisites from Apache TVM, which can be installed on Ubuntu/Debian-based systems using the following commands: +First, install the OS-level prerequisites on Ubuntu/Debian-based systems using the following commands: ```bash apt-get update apt-get install -y python3 python3-dev python3-setuptools gcc zlib1g-dev build-essential cmake libedit-dev ``` -After installing the prerequisites, you can clone the **tile-lang** repository and install it using pip: +Then, clone the tilelang repository and install it using pip. The `-v` flag enables verbose output during the build process. + +> **Note**: Use the `--recursive` flag to include necessary submodules. Tilelang currently depends on a customized version of TVM, which is included as a submodule. If you prefer [Building with Existing TVM Installation](#using-existing-tvm), you can skip cloning the TVM submodule (but still need other dependencies). ```bash git clone --recursive https://github.com/tile-ai/tilelang.git @@ -59,13 +59,19 @@ cd tilelang pip install . -v ``` -If you want to install **tile-lang** in development mode, you can run the following command: +If you want to install tilelang in development mode, you can use the `-e` flag so that any changes to the Python files will be reflected immediately without reinstallation. ```bash pip install -e . -v ``` -If you prefer to work directly from the source tree via `PYTHONPATH`, make sure the native extension is built first: +> **Note**: changes to C++ files require rebuilding the tilelang C++ library. See [Faster Rebuild for Developers](#faster-rebuild-for-developers) below. 
A default `build` directory will be created if you use `pip install`, so you can also directly run `make` in the `build` directory to rebuild it as [Working from Source via PYTHONPATH](#working-from-source-via-pythonpath) suggested below. + +(working-from-source-via-pythonpath)= + +### Working from Source via `PYTHONPATH` (Recommended for Developers) + +If you prefer to work directly from the source tree via `PYTHONPATH` instead of using pip, make sure the native extension (`libtilelang.so`) is built first: ```bash mkdir -p build @@ -73,6 +79,14 @@ cd build cmake .. -DUSE_CUDA=ON make -j ``` + +We also recommend using `ninja` to speed up compilation: + +```bash +cmake .. -DUSE_CUDA=ON -G Ninja +ninja +``` + Then add the repository root to `PYTHONPATH` before importing `tilelang`, for example: ```bash @@ -85,17 +99,23 @@ Some useful CMake options you can toggle while configuring: - `-DUSE_ROCM=ON` selects ROCm support when building on AMD GPUs. - `-DNO_VERSION_LABEL=ON` disables the backend/git suffix in `tilelang.__version__`. -We currently provide four methods to install **tile-lang**: +(using-existing-tvm)= + +### Building with Customized TVM Path -1. [Install Using Docker](#install-method-1) (Recommended) -2. [Install from Source (using the bundled TVM submodule)](#install-method-2) -3. [Install from Source (using your own TVM installation)](#install-method-3) +If you already have a TVM codebase, use the `TVM_ROOT` environment variable to specify the location of your existing TVM repository when building tilelang: + +```bash +TVM_ROOT= pip install . -v +``` -(install-method-1)= +> **Note**: This will still rebuild the TVM-related libraries (stored in `TL_LIBS`). And this method often leads to some path issues. Check `env.py` to see some environment variables which are not set properly. -### Method 1: Install Using Docker (Recommended) +(install-using-docker)= -For users who prefer a containerized environment with all dependencies pre-configured, **tile-lang** provides Docker images for different CUDA versions. This method is particularly useful for ensuring consistent environments across different systems and is the **recommended approach** for most users. +## Install Using Docker + +For users who prefer a containerized environment with all dependencies pre-configured, tilelang provides Docker images for different CUDA versions. This method is particularly useful for ensuring consistent environments across different systems. **Prerequisites:** - Docker installed on your system @@ -142,82 +162,106 @@ docker run -itd \ - `--name tilelang_b200`: Assigns a name to the container for easy management - `/bin/zsh`: Uses zsh as the default shell -4. **Access the Container**: +4. **Access the Container and Verify Installation**: ```bash docker exec -it tilelang_b200 /bin/zsh -``` - -5. **Verify Installation**: - -Once inside the container, verify that **tile-lang** is working correctly: - -```bash +# Inside the container: python -c "import tilelang; print(tilelang.__version__)" ``` -You can now run TileLang examples and develop your applications within the containerized environment. The Docker image comes with all necessary dependencies pre-installed, including CUDA toolkit, TVM, and TileLang itself. +### ROCm container build (gfx942/gfx950) -**Example Usage:** +If you want a ready-to-use ROCm image that builds TileLang from source, use +`docker/Dockerfile.rocm`. This is the recommended path for a clean, reproducible +environment. 
-After accessing the container, you can run TileLang examples: +If you are already inside another ROCm container (for example, the `sglang` +image) and just need to rebuild TileLang in-place, follow the steps below. -```bash -cd /home/tilelang/examples -python elementwise/test_example_elementwise.py -``` - -This Docker-based installation method provides a complete, isolated environment that works seamlessly on systems with compatible NVIDIA GPUs like the B200, ensuring optimal performance for TileLang applications. - -(install-method-2)= - -### Method 2: Install from Source (Using the Bundled TVM Submodule) - -If you already have a compatible TVM installation, follow these steps: - -1. **Clone the Repository**: +If you are using the `sglang` ROCm container and need to build TileLang in it (for example on MI300 `gfx942` or MI355 `gfx950`), the build requires extra system libraries, Cython, and a valid `llvm-config`. The following steps match the build flow used in `sglang/docker/rocm.Dockerfile`: ```bash -git clone --recursive https://github.com/tile-ai/tilelang -cd tilelang -``` - -**Note**: Use the `--recursive` flag to include necessary submodules. - -2. **Configure Build Options**: - -Create a build directory and specify your existing TVM path: - -```bash -pip install . -v -``` - -(install-method-3)= - -### Method 3: Install from Source (Using Your Own TVM Installation) - -If you prefer to use the built-in TVM version, follow these instructions: - -1. **Clone the Repository**: - -```bash -git clone --recursive https://github.com/tile-ai/tilelang -cd tilelang +# Inside the container (as root) +apt-get update && apt-get install -y --no-install-recommends \ + build-essential git wget curl ca-certificates gnupg \ + libgtest-dev libgmock-dev \ + libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev \ + python3 python3-dev python3-setuptools python3-pip \ + gcc libtinfo-dev zlib1g-dev libedit-dev libxml2-dev \ + cmake ninja-build pkg-config libstdc++6 \ + && rm -rf /var/lib/apt/lists/* + +# Prefer the container venv (avoid system pip) +export PATH="/opt/venv/bin:${PATH}" + +# Build GoogleTest static libs (Ubuntu package ships sources only) +cmake -S /usr/src/googletest -B /tmp/build-gtest -DBUILD_GTEST=ON -DBUILD_GMOCK=ON -DCMAKE_BUILD_TYPE=Release +cmake --build /tmp/build-gtest -j"$(nproc)" +cp -v /tmp/build-gtest/lib/*.a /usr/lib/x86_64-linux-gnu/ +rm -rf /tmp/build-gtest + +# Keep setuptools < 80 (compat with some base images) +pip install --upgrade "setuptools>=77.0.3,<80" wheel cmake ninja scikit-build-core + +# Locate ROCm llvm-config (install LLVM 18 if missing) +LLVM_CONFIG_PATH="" +for p in /opt/rocm/llvm/bin/llvm-config /opt/rocm/llvm-*/bin/llvm-config /opt/rocm-*/llvm*/bin/llvm-config; do + if [ -x "$p" ]; then LLVM_CONFIG_PATH="$p"; break; fi +done +if [ -z "$LLVM_CONFIG_PATH" ]; then + echo "ROCm llvm-config not found; installing LLVM 18..." 
+ curl -fsSL https://apt.llvm.org/llvm.sh -o /tmp/llvm.sh + chmod +x /tmp/llvm.sh + /tmp/llvm.sh 18 + LLVM_CONFIG_PATH="$(command -v llvm-config-18)" + if [ -z "$LLVM_CONFIG_PATH" ]; then + echo "ERROR: llvm-config-18 not found after install" + exit 1 + fi +fi +export LLVM_CONFIG="$LLVM_CONFIG_PATH" +export PATH="$(dirname "$LLVM_CONFIG"):/usr/local/bin:${PATH}" + +# Optional shim for tools that expect llvm-config-16 +mkdir -p /usr/local/bin +printf "#!/usr/bin/env bash\nexec \"%s\" \"\$@\"\n" "$LLVM_CONFIG_PATH" > /usr/local/bin/llvm-config-16 +chmod +x /usr/local/bin/llvm-config-16 + +# TVM Python bits need Cython (for system Python used by the build) +pip install --no-cache-dir "cython>=0.29.36,<3.0" + +# Clone + build TileLang (ROCm) +# Default location: /opt/tilelang (adjust if you prefer a different path). +git clone --recursive https://github.com/tile-ai/tilelang.git /opt/tilelang +cd /opt/tilelang +git submodule update --init --recursive +export CMAKE_ARGS="-DUSE_CUDA=OFF -DUSE_ROCM=ON -DROCM_PATH=/opt/rocm -DLLVM_CONFIG=${LLVM_CONFIG}" + +# Avoid pulling CUDA wheels / reinstalling torch by skipping dependency resolution. +# Assume torch is already installed in the container. +pip install -e . -v --no-build-isolation --no-deps + +# Manually install required runtime deps when using --no-deps. +# Note: skip torch-c-dlpack-ext on ROCm (its wheel expects CUDA libs). +pip install "apache-tvm-ffi>=0.1.6" "z3-solver>=4.13.0" +# If you already installed torch-c-dlpack-ext and hit `libtorch_cuda.so` errors: +# pip uninstall -y torch-c-dlpack-ext + +# If you hit Cython compile errors like `PyLong_SHIFT`/`digit` not declared, +# disable the stable ABI (abi3) for editable builds: +# export CMAKE_ARGS="-DUSE_CUDA=OFF -DUSE_ROCM=ON -DROCM_PATH=/opt/rocm -DLLVM_CONFIG=${LLVM_CONFIG} -DSKBUILD_SABI_VERSION=" +# pip install -e . -v --no-build-isolation --no-deps + +# Verify +python -c "import tilelang; print(tilelang.__version__)" ``` -**Note**: Ensure the `--recursive` flag is included to fetch submodules. - -2. **Configure Build Options**: - -Copy the configuration file and enable the desired backends (e.g., LLVM and CUDA): - -```bash -TVM_ROOT= pip install . -v -``` +If you still want to use `pip install -e . -v --no-build-isolation` without `--no-deps`, pip will try to resolve TileLang dependencies and may download CUDA wheels (e.g., `nvidia_cudnn`, `nvidia_nvshmem`) and reinstall `torch`. To avoid that in ROCm containers, keep `--no-deps` and ensure required packages are already installed. ## Install with Nightly Version -For users who want access to the latest features and improvements before official releases, we provide nightly builds of **tile-lang**. +For users who want access to the latest features and improvements before official releases, we provide nightly builds of tilelang. ```bash pip install tilelang -f https://tile-ai.github.io/whl/nightly/cu121/ @@ -252,24 +296,28 @@ Set `NO_TOOLCHAIN_VERSION=ON` to disable this. ### Run-time environment variables - +Please refer to the `env.py` file for a full list of supported run-time environment variables. -## IDE Configs +## Other Tips -Building tilelang locally will automatically `compile_commands.json` file in `build` dir. +### IDE Configs + +Building tilelang locally will automatically generate a `compile_commands.json` file in `build` dir. VSCode with clangd and [clangd extension](https://marketplace.visualstudio.com/items?itemName=llvm-vs-code-extensions.vscode-clangd) should be able to index that without extra configuration. 
-## Compile cache +### Compile Cache -`ccache` will be automatically used if found. +The default path of the compile cache is `~/.tilelang/cache`. `ccache` will be automatically used if found. -## Repairing wheels +### Repairing Wheels If you plan to use your wheel in other environment, -it's recommend to use auditwheel (on Linux) or delocate (on Darwin) +it's recommended to use auditwheel (on Linux) or delocate (on Darwin) to repair them. -## Faster rebuild for developers +(faster-rebuild-for-developers)= + +### Faster Rebuild for Developers `pip install` introduces extra [un]packaging and takes ~30 sec to complete, even if no source change. @@ -278,8 +326,17 @@ Developers who needs to recompile frequently could use: ```bash pip install -r requirements-dev.txt + +# For first time compilation pip install -e . -v --no-build-isolation +# Or manually compile with cmake/ninja. Remember to set PYTHONPATH properly. +mkdir build +cd build +cmake .. -G Ninja +ninja + +# Rebuild when you change the cpp code cd build; ninja ``` diff --git a/docs/get_started/overview.md b/docs/get_started/overview.md index 18fa9f193..a7c154f31 100644 --- a/docs/get_started/overview.md +++ b/docs/get_started/overview.md @@ -15,49 +15,49 @@ Figure 1: High-level overview of the TileLang compilation flow. ## Programming Interfaces 1. **Beginner Level (Hardware-Unaware)** - - Intended for users who need to write code that is independent of specific hardware details. - - The goal is to let developers focus on the basic logic without worrying about memory hierarchies or hardware-specific optimizations. + - Intended for users who need to write code that is independent of specific hardware details. + - The goal is to let developers focus on the basic logic without worrying about memory hierarchies or hardware-specific optimizations. - *Note:* This interface is not yet fully implemented. 2. **Developer Level (Hardware-Aware with Tile Library)** - - Designed for developers who have a basic understanding of GPU memory hierarchies and performance considerations. - - Provides a **Tile Library**, containing predefined operations and patterns optimized for various hardware architectures. + - Designed for developers who have a basic understanding of GPU memory hierarchies and performance considerations. + - Provides a **Tile Library**, containing predefined operations and patterns optimized for various hardware architectures. - Users at this level can leverage these ready-made primitives without diving into low-level threading details. 3. **Expert Level (Hardware-Aware with Thread Primitives)** - - For highly experienced users who have an in-depth understanding of low-level hardware characteristics (e.g., threading models, memory coalescing). - - Offers direct access to **thread primitives** and other low-level constructs, allowing for fine-grained control of performance-critical kernels. + - For highly experienced users who have an in-depth understanding of low-level hardware characteristics (e.g., threading models, memory coalescing). + - Offers direct access to **thread primitives** and other low-level constructs, allowing for fine-grained control of performance-critical kernels. - This level grants maximum flexibility for specialized optimizations tailored to specific GPU or multi-core architectures. ## Compilation Flow -1. **Tile Program** +1. **Tile Program** A high-level specification of the computation. 
Depending on the user’s expertise, they may write a purely hardware-unaware tile program or incorporate constructs from the Tile Library or thread primitives. -2. **Tile Program with Tile Library** +2. **Tile Program with Tile Library** When developers choose from the Tile Library, the original Tile Program is expanded with specialized library calls. These calls encapsulate efficient implementation patterns for different operations. -3. **Tile Program with Thread Primitives** +3. **Tile Program with Thread Primitives** Expert-level developers can explicitly use low-level threading constructs to hand-optimize data layout, synchronization, and memory usage. -4. **IRModule** +4. **IRModule** After the program is composed with libraries or thread primitives, it is lowered to an intermediate representation (IR) that captures the necessary hardware details. -5. **Source Code Generation (C/CUDA/HIP/LLVM/…)** +5. **Source Code Generation (C/CUDA/HIP/LLVM/…)** From the IR, the system generates target-specific source code. This source code is tuned for the desired backends or GPU architectures (e.g., NVIDIA, AMD). -6. **Hardware-Specific Executable/Runtime** +6. **Hardware-Specific Executable/Runtime** Finally, the generated source is compiled into hardware-specific executables, ready to run on the corresponding devices. The pipeline supports multiple GPU backends and can be extended to additional architectures. ## Tile-based Programming Model -[Figure 2](#fig-overview-gemm) provides a concise matrix multiplication (GEMM) example in ``TileLang``, -illustrating how developers can employ high-level constructs such as tiles, memory placement, pipelining, +[Figure 2](#fig-overview-gemm) provides a concise matrix multiplication (GEMM) example in ``TileLang``, +illustrating how developers can employ high-level constructs such as tiles, memory placement, pipelining, and operator calls to manage data movement and computation with fine-grained control. -In particular, this snippet ([Figure 2](#fig-overview-gemm) (a)) demonstrates how multi-level tiling -leverages different memory hierarchies (global, shared, and registers) to optimize bandwidth utilization +In particular, this snippet ([Figure 2](#fig-overview-gemm) (a)) demonstrates how multi-level tiling +leverages different memory hierarchies (global, shared, and registers) to optimize bandwidth utilization and reduce latency. -Overall, [Figure 2](#fig-overview-gemm) (b) showcases how the Python-like syntax of ``TileLang`` +Overall, [Figure 2](#fig-overview-gemm) (b) showcases how the Python-like syntax of ``TileLang`` allows developers to reason about performance-critical optimizations within a user-friendly programming model. ```{figure} ../_static/img/MatmulExample.png diff --git a/docs/get_started/targets.md b/docs/get_started/targets.md index c2b3f2fb5..3a464bd66 100644 --- a/docs/get_started/targets.md +++ b/docs/get_started/targets.md @@ -14,6 +14,7 @@ the generated code. The most frequent choices are listed below: | --------- | ----------- | | `auto` | Detects CUDA → HIP → Metal in that order. Useful when running the same script across machines. | | `cuda` | NVIDIA GPUs. Supports options such as `-arch=sm_80`, `-max_num_threads=1024`, etc. | +| `cutedsl` | NVIDIA CUTLASS/CuTe DSL backend. Requires `nvidia-cutlass-dsl`. `cuda` options can also be applied to this target. | | `hip` | AMD GPUs via ROCm. Options like `-mcpu=gfx90a` can be appended. | | `metal` | Apple Silicon GPUs (arm64 Macs). 
| | `llvm` | CPU execution; accepts the standard TVM LLVM switches. | diff --git a/docs/index.md b/docs/index.md index 5d9a158f8..1c78ea2f6 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,10 +2,10 @@ [GitHub](https://github.com/tile-ai/tilelang) -Tile Language (tile-lang) is a concise domain-specific language designed to streamline -the development of high-performance GPU/CPU kernels (e.g., GEMM, Dequant GEMM, FlashAttention, LinearAttention). -By employing a Pythonic syntax with an underlying compiler infrastructure on top of TVM, -tile-lang allows developers to focus on productivity without sacrificing the +Tile Language (tile-lang) is a concise domain-specific language designed to streamline +the development of high-performance GPU/CPU kernels (e.g., GEMM, Dequant GEMM, FlashAttention, LinearAttention). +By employing a Pythonic syntax with an underlying compiler infrastructure on top of TVM, +tile-lang allows developers to focus on productivity without sacrificing the low-level optimizations necessary for state-of-the-art performance. :::{toctree} @@ -17,13 +17,26 @@ get_started/overview get_started/targets ::: - :::{toctree} :maxdepth: 1 :caption: TUTORIALS tutorials/debug_tools_for_tilelang tutorials/auto_tuning +tutorials/logging +::: + +:::{toctree} +:maxdepth: 1 +:caption: PROGRAMMING GUIDES + +programming_guides/overview +programming_guides/language_basics +programming_guides/instructions +programming_guides/control_flow +programming_guides/python_compatibility +programming_guides/autotuning +programming_guides/type_system ::: :::{toctree} @@ -33,6 +46,7 @@ tutorials/auto_tuning deeplearning_operators/elementwise deeplearning_operators/gemv deeplearning_operators/matmul +deeplearning_operators/matmul_sparse deeplearning_operators/deepseek_mla ::: @@ -42,6 +56,7 @@ deeplearning_operators/deepseek_mla compiler_internals/letstmt_inline compiler_internals/inject_fence_proxy +compiler_internals/tensor_checks ::: :::{toctree} diff --git a/docs/programming_guides/autotuning.md b/docs/programming_guides/autotuning.md new file mode 100644 index 000000000..9cc5a2d94 --- /dev/null +++ b/docs/programming_guides/autotuning.md @@ -0,0 +1,308 @@ +# Autotuning + +TileLang includes a built‑in autotuner that searches configuration spaces +for the best performing kernel, compiles candidates in parallel, validates +correctness, benchmarks them, and caches the best result for reuse. + +This guide covers two workflows: +- Decorator‑based: `@tilelang.autotune(configs=...)` stacked on `@tilelang.jit` +- Programmatic: `AutoTuner.from_kernel(...).set_*().run()` + +It also explains input tensor supply, validation, caching, and environment +variables that affect parallelism and cache behavior. + +## 1) Decorator‑based Autotune + +Use `@tilelang.autotune` above `@tilelang.jit` and expose tunable parameters as +function arguments with defaults. The autotuner overrides these parameters with +values from your config space. 
+ +```python +import tilelang +import tilelang.language as T + +def matmul_configs(M, N, K): + # Example space — tailor to your target + tiles = [64, 128] + stages = [2, 3] + threads = [128, 256] + return [ + dict(block_M=BM, block_N=BN, block_K=BK, num_stages=S, threads=TH) + for BM in tiles + for BN in tiles + for BK in [32, 64] + for S in stages + for TH in threads + ] + +@tilelang.autotune(configs=matmul_configs, warmup=25, rep=100, timeout=60) +@tilelang.jit(out_idx=[-1]) +def matmul(M: int, N: int, K: int, + block_M: int = 128, block_N: int = 128, block_K: int = 32, + threads: int = 128, num_stages: int = 3, + dtype: str = 'float16', accum_dtype: str = 'float32'): + + @T.prim_func + def kernel(A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype)): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_s = T.alloc_shared((block_M, block_K), dtype) + B_s = T.alloc_shared((block_K, block_N), dtype) + C_f = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_f) + + for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(A[by * block_M, ko * block_K], A_s) + T.copy(B[ko * block_K, bx * block_N], B_s) + T.gemm(A_s, B_s, C_f) + + T.copy(C_f, C[by * block_M, bx * block_N]) + + return kernel + +# Usage +# Provide inputs via context (recommended for reproducibility across configs) +import torch +M = N = K = 1024 +A = torch.randn(M, K, device='cuda', dtype=torch.float16) +B = torch.randn(K, N, device='cuda', dtype=torch.float16) +C = torch.empty(M, N, device='cuda', dtype=torch.float16) + +from tilelang.autotuner import set_autotune_inputs +with set_autotune_inputs(A, B, C): + tuned_kernel = matmul(M, N, K) # compiles, tunes, returns best kernel + tuned_kernel(A, B, C) # run best kernel +``` + +Notes +- `configs` can be a list of dicts or a callable `(args...) -> list[dict]`. Each + dict’s keys must match the tunable function arguments (e.g., `block_M`). +- The decorator returns a callable that runs autotune once per argument tuple + and caches the resulting best kernel in‑process. +- For explicit input control during tuning, wrap the call with + `set_autotune_inputs(...)`. Otherwise, `supply_type` (below) is used. + +## 2) Programmatic Autotune + +Use the `AutoTuner` class to manage configs and arguments more explicitly. 
+ +```python +from tilelang.autotuner import AutoTuner + +kernel_factory = matmul # the function above (already @tilelang.jit) +tuner = AutoTuner.from_kernel(kernel_factory(M, N, K), configs=matmul_configs(M, N, K)) + +tuner.set_profile_args( + warmup=25, rep=100, timeout=60, + supply_type=tilelang.TensorSupplyType.Auto, # or provide supply_prog/ref_prog + ref_prog=lambda A, B, C: torch.allclose(C, (A @ B).to(C.dtype), rtol=1e-2, atol=1e-2), +) + +tuner.set_compile_args( + target='auto', # or 'cuda'/'hip'/'metal' + execution_backend='auto', # resolves per-target + out_idx=[-1], # which outputs to return if multiple + pass_configs={ # optional TVM passes/flags + # tilelang.PassConfigKey.EXAMPLE_KEY: value, + }, +) + +artifact = tuner.run() # compiles + runs + validates all configs +best_kernel = artifact.kernel # JITKernel +best_latency = artifact.latency +best_config = artifact.config + +# Reuse best kernel +best_kernel(A, B, C) +``` + +### Example Gallery (in repo) +- examples/gdn/example_chunk_delta_h.py:101 — uses `@autotune` to sweep configs +- examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py:451 — uses `@tilelang.autotune` +- examples/quickstart.py:84 — profiles a tuned kernel with `get_profiler` +- examples/hadamard_transform/example_hadamard.py:152 — profiler with custom warmup +- examples/dynamic_shape/example_dynamic.py:94 — profiler for dynamic shapes +- examples/gemm/example_gemm_persistent.py:135 — compare persistent vs non‑persistent + +Click any path to open the code and compare patterns. + +## Input Tensor Supply + +The tuner needs inputs to compile and benchmark kernels. Provide them in one of +three ways (priority order): + +1) Context manager (fixed inputs across configs) +```python +with set_autotune_inputs(A, B, C): + tuned = matmul(M, N, K) +``` + +2) Custom supplier program +```python +def supply_prog(signature): + # signature holds KernelParam objects describing shapes/dtypes + # Return a list of torch tensors matching the kernel’s arguments + return [A, B, C] + +tuner.set_profile_args(supply_prog=supply_prog) +``` + +3) Built‑in generators via `supply_type` +- `TensorSupplyType.Auto` (default): heuristic per dtype (uniform ints / fp ranges) +- `Integer`, `Uniform`, `Normal`, `Randn`, `Zero`, `One` + +Important +- Built‑in generators require static shapes; if your PrimFunc uses symbolic + dimensions (T.dyn), supply concrete inputs via (1) or (2). +- Float8 dtypes require PyTorch 2.1+ for `torch.float8_*` support. + +## Correctness Checking and Tolerances + +Use one of the following validation methods: +- `ref_prog`: Provide a reference program that receives the same inputs and + checks results. You can return a boolean or raise on mismatch. +- `manual_check_prog`: A callable that inspects outputs and raises on mismatch. +- `skip_check=True`: Skip correctness checks (faster, use with caution). + +Control numeric drift via: +- `rtol` and `atol` (defaults 1e‑2) +- `max_mismatched_ratio` (default 1%) + +## Configuration Spaces and Best Practices + +What to tune +- Tile sizes: `block_M`, `block_N`, `block_K` +- Software pipelining: `num_stages` +- Threads per block: `threads` (or (x, y) tuple) +- Optional: dtype variants, epilogues, small scheduling knobs + +Tips +- Start from a working baseline. Tune a small, meaningful space first. +- Respect hardware limits (shared memory bytes, registers per thread/block, + max threads per block). Eliminate impossible configs up‑front. +- Keep block sizes multiples of vector widths and warp sizes when relevant. 
+- Use `set_autotune_inputs` to ensure each config is measured on identical data. +- Record your best configs and bake them as defaults when stable. + +## Parallel Compilation/Benchmarking and Timeouts + +The tuner compiles configurations in parallel using a thread pool and benchmarks +them with a per‑config timeout. On CUDA, each worker sets the current device to +avoid context issues. + +Notes +- `timeout` uses POSIX signals; on non‑Unix systems, it may not take effect. +- Logs are written to `autotuner.log` in the working directory. + +## Caching + +The autotuner caches best artifacts both in‑memory (per process) and on disk under +`$TILELANG_CACHE_DIR/autotuner`. The cache key includes: +- TileLang version, function source, closure free‑vars +- Config list, compile args, profile args + +Disk cache contents (per key) +- Best config and latency: `best_config.json`, `latency.json` +- Kernel sources and library: `device_kernel.cu`, `host_kernel.cu`, `kernel_lib.so` (or `kernel.cubin`/`executable.so` depending on backend) +- Function and params: `function.pkl`, `params.pkl` + +Control via env vars (tilelang.env) +- `TILELANG_CACHE_DIR` (default `~/.tilelang/cache`) +- `TILELANG_TMP_DIR` (default `$TILELANG_CACHE_DIR/tmp`) +- Disable all kernel caches: `TILELANG_DISABLE_CACHE=1` +- Disable autotune disk cache only: `TILELANG_AUTO_TUNING_DISABLE_CACHE=1` + +CPU worker control +- `TILELANG_AUTO_TUNING_CPU_UTILITIES` (fraction, default 0.9) +- `TILELANG_AUTO_TUNING_CPU_COUNTS` (int, `-1` auto) +- `TILELANG_AUTO_TUNING_MAX_CPU_COUNT` (int, `-1` unlimited) + +Backend notes +- NVRTC backend persists `.cubin` and a Python launcher. +- Torch/DLPack backend may not save artifacts to disk; in this case, only + in‑memory caching applies and a warning is logged. + +## Alternative: Manual Sweeps with par_compile + +If you prefer manual control, use `JITImpl.par_compile` to compile a batch of +configs and drive your own benchmarking: + +```python +@tilelang.jit +def factory(M, N, K, block_M=128, block_N=128, block_K=32): + @T.prim_func + def k(A: T.Tensor((M, K), 'float16'), + B: T.Tensor((K, N), 'float16'), + C: T.Tensor((M, N), 'float16')): + ... + return k + +impl = factory # JITImpl +cfgs = [ + dict(block_M=64, block_N=128, block_K=32), + dict(block_M=128, block_N=128, block_K=64), +] +kernels = impl.par_compile(cfgs, num_workers=4) +# Now benchmark kernels[i](A, B, C) yourself +``` + +## Recording and Reusing Best Configs + +The programmatic path returns an `AutotuneResult` that can be saved and later +reloaded. This is useful for CI, multi‑host workflows, or shipping tuned configs. + +```python +artifact = tuner.run() # AutotuneResult + +# Save to disk +from pathlib import Path +save_dir = Path('out/best/matmul_1024') +artifact.save_to_disk(save_dir, verbose=True) + +# Reload later +from tilelang.autotuner.param import AutotuneResult, CompileArgs +restored = AutotuneResult.load_from_disk(save_dir, CompileArgs()) +best = restored.kernel +best(A, B, C) +``` + +Notes +- DLPack/Torch execution backend may not persist compiled binaries; in that + case, re‑compilation is needed on load or use a different backend. +- The directory contains human‑readable JSONs (best config/latency) and sources. 
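+
+A common CI-style reuse pattern, sketched from the calls above (the cache directory path is illustrative, and `tuner`, `A`, `B`, `C` are the objects defined earlier on this page), is to load a saved result when it exists and fall back to tuning otherwise:
+
+```python
+from pathlib import Path
+from tilelang.autotuner.param import AutotuneResult, CompileArgs
+
+save_dir = Path('out/best/matmul_1024')  # illustrative location
+if save_dir.exists():
+    # Reuse the previously tuned artifact (may re-compile depending on backend)
+    result = AutotuneResult.load_from_disk(save_dir, CompileArgs())
+else:
+    # Tune from scratch and persist the best artifact for the next run
+    result = tuner.run()
+    result.save_to_disk(save_dir)
+
+result.kernel(A, B, C)
+```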
+ +## Advanced: Config Space Callables + +Derive config spaces from problem sizes to keep searches targeted and legal: + +```python +def matmul_configs(M, N, K): + large = min(M, N, K) >= 1024 + tiles = [128] if large else [64, 128] + for BM in tiles: + for BN in tiles: + for BK in [32, 64]: + for S in [2, 3]: + for TH in [128, 256]: + yield dict(block_M=BM, block_N=BN, block_K=BK, + num_stages=S, threads=TH) +``` + +## Device and Backend Selection + +Tune compile‑time options explicitly: +- `target='auto'|'cuda'|'hip'|'metal'` (normalized to a TVM Target) +- `execution_backend='auto'|'tvm_ffi'|'cython'|'nvrtc'|'torch'` +- `pass_configs={...}` to toggle TileLang/TVM passes for experiments + +On CUDA with multiple GPUs, the tuner sets the current device per worker thread +to avoid context mixups. + +## Troubleshooting +- “No configurations to tune”: Ensure `configs` is a non‑empty list or callable. +- Timeouts: Increase `timeout`; ensure inputs fit device memory; verify that + your reference check isn’t the bottleneck. +- Dynamic shapes: Provide concrete inputs via `set_autotune_inputs` or a custom + `supply_prog`. +- Disk cache disabled: Check `TILELANG_AUTO_TUNING_DISABLE_CACHE` and backend. diff --git a/docs/programming_guides/control_flow.md b/docs/programming_guides/control_flow.md new file mode 100644 index 000000000..259441349 --- /dev/null +++ b/docs/programming_guides/control_flow.md @@ -0,0 +1,149 @@ +# Control Flow + +This guide covers the control‑flow primitives in TileLang and how they lower to +efficient GPU code. You will use these to structure loops, handle boundaries, +and express pipelined compute. + +## Overview +- Conditionals: `if` / `elif` / `else`, ternary (`x if c else y`) +- Loops: `T.serial`, `T.unroll`, `T.Parallel`, `T.Pipelined` +- While loops: `while` with a TIR condition +- Flow control: Python `break` / `continue` +- Safety: automatic OOB guards via the LegalizeSafeMemoryAccess pass + +The examples assume `import tilelang.language as T`. + +## Conditionals + +Standard Python `if`/`elif`/`else` is supported inside `@T.prim_func` kernels. +Conditions should be TIR expressions (e.g., `i < N`). Python plain booleans are +treated as compile‑time constants and will be folded. + +```python +for i in T.serial(N): + if i < N: # TIR condition + C[i] = A[i] + B[i] + else: + pass + +# Ternary +x = (A[i] if i < N else 0) +``` + +Short‑circuit boolean ops are supported. For multi‑dimensional bounds, use +`T.any_of` / `T.all_of` for clarity: + +```python +if T.all_of(i < M, j < N): + C[i, j] = A[i, j] + B[i, j] +``` + +Boundary handling note +- The LegalizeSafeMemoryAccess pass automatically inserts guards when an access + may be out‑of‑bounds, and elides them when proven safe. You can often omit + explicit `if` checks for simple edge handling, but keep them when you need + custom logic or clarity. + +## Loops + +### Serial + +`T.serial` creates a plain for‑loop. Common forms: + +```python +for i in T.serial(N): + ... # 0..N-1 + +for i in T.serial(0, N, 2): + ... # 0, 2, 4, ... +``` + +### Unroll + +`T.unroll` requests loop unrolling for small trip counts. + +```python +for k in T.unroll(K_TILE): + acc += a[k] * b[k] +``` + +Advanced: TileLang forwards unroll hints to TIR; factor/explicit knobs are +available for expert tuning. + +### Parallel (elementwise) + +`T.Parallel(ext0, ext1, ...)` builds nested loops that map well to elementwise +operations. 
The body receives all indices in one `for` header: + +```python +for i, j in T.Parallel(M, N): + C[i, j] = A[i, j] + B[i, j] +``` + +Optional hints: +- `coalesced_width=` controls memory coalescing width (used for vectorization checks). +- `loop_layout=` accepts a `T.Fragment` to annotate the layout of the entire + nested parallel loop. The annotation is attached to the outermost loop only + and must have `InputDim == number of nested parallel extents`. + +### Pipelined (software pipelining) + +`T.Pipelined(iters, num_stages=...)` overlaps producer/consumer stages (e.g., +Global→Shared copies with compute). This is the backbone of GEMM/attention +pipelines. + +```python +for ko in T.Pipelined(T.ceildiv(K, BK), num_stages=3): + T.copy(A[by * BM, ko * BK], A_s) # stage: copy A tile + T.copy(B[ko * BK, bx * BN], B_s) # stage: copy B tile + T.gemm(A_s, B_s, C_f) # stage: compute +``` + +### Persistent (advanced) + +`T.Persistent(domain, wave_size, index, group_size=...)` exposes persistent +thread‑block style looping. It is an advanced construct that TileLang lowers in +later passes and is typically used by specialized templates. + +## While Loops + +`while` is supported when the condition is a TIR expression. Avoid infinite +loops; TileLang will error if it detects a constant‑true condition. + +```python +i = 0 +while i < N: + ... + if done: + break + i += 1 +``` + +## Break and Continue + +Use Python `break`/`continue` to exit or skip within `T.serial`/`T.unroll`/ +`T.Parallel`/`while` loops. Keep the body clean after a `break`/`continue` for +readability; the compiler will ignore the dead path. + +## Putting It Together: Residual Tile Handling + +Below is a typical edge pattern for a 2D kernel. With LegalizeSafeMemoryAccess, +the explicit guard can be omitted when you don’t need a custom edge path. + +```python +for i, j in T.Parallel(M, N): + gi = by * BM + i + gj = bx * BN + j + if T.all_of(gi < M, gj < N): # optional in many cases + C[gi, gj] = A[gi, gj] + B[gi, gj] +``` + +## Debugging Conditions + +Use `T.print` to inspect values under predicates. For buffers, TileLang prints +from a single thread to avoid duplicate outputs. + +```python +if i == 0: + T.print(C, msg='C tile:') +``` diff --git a/docs/programming_guides/instructions.md b/docs/programming_guides/instructions.md new file mode 100644 index 000000000..20beb8325 --- /dev/null +++ b/docs/programming_guides/instructions.md @@ -0,0 +1,184 @@ +# Instructions + +This page summarizes the core TileLang “instructions” available at the DSL +level, how they map to hardware concepts, and how to use them correctly. + +## Quick Categories +- Data movement: `T.copy`, `T.c2d_im2col`, staging Global ↔ Shared ↔ Fragment +- Compute primitives: `T.gemm`/`T.gemm_sp`, elementwise math (`T.exp`, `T.max`), + reductions (`T.reduce_sum`, `T.cumsum`, warp reducers) +- Control helpers: `T.clear`/`T.fill`, `T.reshape`/`T.view` +- Diagnostics: `T.print`, `T.device_assert` +- Advanced: atomics, memory barriers, warp‑group ops + +## Data Movement + +Use `T.copy(src, dst, *, coalesced_width=None, disable_tma=False, eviction_policy=None, loop_layout=None)` +to move tiles between memory scopes. It accepts `tir.Buffer`, `BufferLoad`, or +`BufferRegion`; extents are inferred or broadcast when possible. 
+ +```python +# Global → Shared tiles (extents inferred from dst) +T.copy(A[by * BM, ko * BK], A_s) +T.copy(B[ko * BK, bx * BN], B_s) + +# Fragment/Register → Global (store result) +T.copy(C_f, C[by * BM, bx * BN]) +``` + +Semantics +- Extents are deduced from arguments; missing sides broadcast to the other’s rank. +- Access patterns are legalized and coalesced during lowering. Explicit + vectorization is not required in HL mode. +- Safety: the LegalizeSafeMemoryAccess pass inserts boundary guards when an + access may be out‑of‑bounds and drops them when proven safe. + +Other helpers +- `T.c2d_im2col(img, col, ...)`: convenience for conv‑style transforms. + +## Compute Primitives + +GEMM and sparse GEMM +- `T.gemm(A_shared, B_shared, C_fragment)`: computes a tile GEMM using shared + inputs and a fragment accumulator; lowered to target‑specific tensor cores. +- `T.gemm_sp(...)`: 2:4 sparse tensor core variant (see examples and README). + +Reductions and scans +- `T.reduce_sum`, `T.reduce_max`, `T.reduce_min`, `T.cumsum`, plus warp + reducers (`T.warp_reduce_sum`, etc.). +- Allocate and initialize accumulators via `T.alloc_fragment` + `T.clear` or + `T.fill`. + +Elementwise math +- Most math ops mirror TVM TIR: `T.exp`, `T.log`, `T.max`, `T.min`, `T.rsqrt`, + `T.sigmoid`, etc. Compose freely inside loops. + +Reshape/view (no copy) +- `T.reshape(buf, new_shape)` and `T.view(buf, shape=None, dtype=None)` create + new views that share storage, with shape/dtype checks enforced. + +## Synchronization (HL usage) + +In HL pipelines, you usually don’t need to write explicit barriers. Passes such +as PipelinePlanning/InjectSoftwarePipeline/InjectTmaBarrier orchestrate +producer/consumer ordering and thread synchronization behind the scenes. + +If you need debugging or explicit checks: +- `T.device_assert(cond, msg='')` emits device‑side asserts on CUDA targets. +- `T.print(obj, msg='...')` prints scalars or buffers safely from one thread. + +## Putting It Together: GEMM Tile + +```python +@T.prim_func +def gemm( + A: T.Tensor((M, K), 'float16'), + B: T.Tensor((K, N), 'float16'), + C: T.Tensor((M, N), 'float16'), +): + with T.Kernel(T.ceildiv(N, BN), T.ceildiv(M, BM), threads=128) as (bx, by): + A_s = T.alloc_shared((BM, BK), 'float16') + B_s = T.alloc_shared((BK, BN), 'float16') + C_f = T.alloc_fragment((BM, BN), 'float32') + T.clear(C_f) + + for ko in T.Pipelined(T.ceildiv(K, BK), num_stages=3): + T.copy(A[by * BM, ko * BK], A_s) # Global → Shared + T.copy(B[ko * BK, bx * BN], B_s) + T.gemm(A_s, B_s, C_f) # compute into fragment + + T.copy(C_f, C[by * BM, bx * BN]) # store back +``` + +## Instruction Reference (Concise) + +Below is a concise list of TileLang instructions grouped by category. For full +signatures, behaviors, constraints, and examples, refer to API Reference +(`autoapi/tilelang/index`). + +Data movement +- `T.copy(src, dst, ...)`: Move tiles between Global/Shared/Fragment. +- `T.c2d_im2col(img, col, ...)`: 2D im2col transform for conv. + +Memory allocation and descriptors +- `T.alloc_shared(shape, dtype, scope='shared.dyn')`: Allocate shared buffer. +- `T.alloc_fragment(shape, dtype, scope='local.fragment')`: Allocate fragment. +- `T.alloc_var(dtype, [init], scope='local.var')`: Scalar var buffer (1 elem). +- `T.alloc_barrier(arrive_count)`: Allocate and initialize one or more mbarriers. +- `T.alloc_tmem(shape, dtype)`: Tensor memory (TMEM) buffer (Hopper+). +- `T.alloc_reducer(shape, dtype, op='sum', replication=None)`: Reducer buf. 
+- `T.alloc_descriptor(kind, dtype)`: Generic descriptor allocator. + - `T.alloc_wgmma_desc(dtype='uint64')` + - `T.alloc_tcgen05_smem_desc(dtype='uint64')` + - `T.alloc_tcgen05_instr_desc(dtype='uint32')` +- `T.empty(shape, dtype='float32')`: Declare function output tensors. + +Compute primitives +- `T.gemm(A_s, B_s, C_f)`: Tile GEMM into fragment accumulator. +- `T.gemm_sp(...)`: Sparse (2:4) tensor core GEMM. +- Reductions: `T.reduce_sum/max/min/abssum/absmax`, bitwise `and/or/xor`. +- Scans: `T.cumsum`, finalize: `T.finalize_reducer`. +- Warp reducers: `T.warp_reduce_sum/max/min/bitand/bitor`. +- Elementwise math: TIR ops (`T.exp`, `T.log`, `T.max`, `T.min`, `T.rsqrt`, ...). +- Fast math: `T.__log/__log2/__log10/__exp/__exp2/__exp10/__sin/__cos/__tan`. +- IEEE math: `T.ieee_add/sub/mul/fmaf` (configurable rounding). +- Helpers: `T.clear(buf)`, `T.fill(buf, value)`. +- Views: `T.reshape(buf, shape)`, `T.view(buf, shape=None, dtype=None)`. + +Diagnostics +- `T.print(obj, msg='')`: Print scalar/buffer from one thread. +- `T.device_assert(cond, msg='')`: Device-side assert (CUDA). + +Logical helpers +- `T.any_of(a, b, ...)`, `T.all_of(a, b, ...)`: Multi-term predicates. + +Annotation helpers +- `T.use_swizzle(panel_size=..., enable=True)`: Rasterization hint. +- `T.annotate_layout({...})`: Attach explicit layouts to buffers. +- `T.annotate_safe_value(var, ...)`: Safety/const hints. +- `T.annotate_l2_hit_ratio(buf, ratio)`: Cache behavior hint. + +Synchronization helpers +- `T.pdl_trigger()`: Signal programmatic launch completion for the current kernel. +- `T.pdl_sync()`: Wait until kernel dependencies are satisfied. + +Atomics +- `T.atomic_add(dst, value, memory_order=None, return_prev=False, use_tma=False)`. +- `T.atomic_addx2(dst, value, return_prev=False)`; `T.atomic_addx4(...)`. +- `T.atomic_max(dst, value, memory_order=None, return_prev=False)`. +- `T.atomic_min(dst, value, memory_order=None, return_prev=False)`. +- `T.atomic_load(dst)`, `T.atomic_store(dst, value)`. + +Custom intrinsics +- `T.dp4a(A, B, C)`: 4‑element dot‑product accumulate. +- `T.clamp(x, lo, hi)`: Clamp to [lo, hi]. +- `T.loop_break()`: Break from current loop via intrinsic. + +Barriers, TMA, warp‑group +- Barriers: `T.alloc_barrier(arrive_count)`. +- Parity ops: `T.mbarrier_wait_parity(barrier, parity)`, `T.mbarrier_arrive(barrier)`. +- Expect tx: `T.mbarrier_expect_tx(...)`; sugar: `T.barrier_wait(id, parity=None)`. +- TMA: `T.create_tma_descriptor(...)`, `T.tma_load(...)`, + `T.tma_store_arrive(...)`, `T.tma_store_wait(...)`. +- Proxy/fences: `T.fence_proxy_async(...)`, `T.warpgroup_fence_operand(...)`. +- Warp‑group: `T.warpgroup_arrive()`, `T.warpgroup_commit_batch()`, + `T.warpgroup_wait(num_mma)`, `T.wait_wgmma(id)`. + +Lane/warp index +- `T.get_lane_idx(warp_size=None)`: Lane id in warp. +- `T.get_warp_idx_sync(warp_size=None)`: Canonical warp id (sync). +- `T.get_warp_idx(warp_size=None)`: Canonical warp id (no sync). +- `T.get_warp_group_idx(warp_size=None, warps_per_group=None)`: Group id. + +Register control +- `T.set_max_nreg(reg_count, is_inc)`, `T.inc_max_nreg(n)`, `T.dec_max_nreg(n)`. +- `T.annotate_producer_reg_dealloc(n=24)`, `T.annotate_consumer_reg_alloc(n=240)`. +- `T.no_set_max_nreg()`, `T.disable_warp_group_reg_alloc()`. + +## Notes on Dtypes + +Dtypes accept three equivalent forms: +- String: `'float32'` +- TileLang dtype: `T.float32` +- Framework dtype: `torch.float32` +All are normalized internally. See Type System for details. 
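+
+As a small, self-contained sketch of that normalization (the kernel is just an element-wise add mirroring the basics guide; names and sizes are illustrative), the same factory accepts all three spellings:
+
+```python
+import torch
+import tilelang
+import tilelang.language as T
+
+@tilelang.jit
+def vec_add(N: int, dtype='float32', block: int = 256):
+
+    @T.prim_func
+    def kernel(
+        A: T.Tensor((N,), dtype),
+        B: T.Tensor((N,), dtype),
+        C: T.Tensor((N,), dtype),
+    ):
+        with T.Kernel(T.ceildiv(N, block), threads=block) as bx:
+            for i in T.Parallel(block):
+                C[bx * block + i] = A[bx * block + i] + B[bx * block + i]
+
+    return kernel
+
+# Each call builds the same kernel; the dtype spellings are normalized internally.
+k_str = vec_add(1024, dtype='float16')      # string
+k_tl = vec_add(1024, dtype=T.float16)       # TileLang dtype object
+k_pt = vec_add(1024, dtype=torch.float16)   # framework (torch) dtype
+```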
diff --git a/docs/programming_guides/language_basics.md b/docs/programming_guides/language_basics.md new file mode 100644 index 000000000..1152680c9 --- /dev/null +++ b/docs/programming_guides/language_basics.md @@ -0,0 +1,234 @@ +# Language Basics + +This page introduces the core TileLang (tile‑lang) DSL that you’ll use to write +high‑performance kernels. It focuses on how to define a kernel, express +iteration, move data across memory scopes, and run it with JIT. + +The examples use the conventional aliases: + +```python +import tilelang +import tilelang.language as T +from tilelang import jit +``` + +## 1. Defining a Kernel with `@T.prim_func` + +TileLang kernels are TIR (TVM IR) functions produced by the `@T.prim_func` +decorator. Arguments are annotated with shapes and dtypes via `T.Tensor` or +`T.Buffer`. + +Note on dtypes +- You can pass dtypes as a string (e.g., 'float32'), a TileLang dtype (e.g., `T.float32`), + or a framework dtype (e.g., `torch.float32`). TileLang normalizes all of these. + See Type System for details. + +```python +@T.prim_func +def add_kernel( + A: T.Tensor((N,), dtype), # dtype could be 'float32' | T.float32 | torch.float32 + B: T.Tensor((N,), dtype), + C: T.Tensor((N,), dtype), +): + ... # kernel body +``` + +- Shapes may be concrete integers or symbolic. For symbolic, you can pass + Python ints through the outer `@jit` wrapper (shown below), or annotate with + `T.dyn` when you want a named symbolic dimension. + +```python +# Named symbolic dimension (optional) +K = T.dyn['K'] +@T.prim_func +def uses_dyn(A: T.Tensor((K,), 'float32')): + ... +``` + +### Dynamic symbolic dimensions: two ways + +TileLang supports two complementary ways to introduce symbolic (dynamic) dims: + +- Type-level annotations via `T.dyn[...]` (recommended for function signatures) + - Use in `T.Tensor((T.dyn['K'], ...), dtype)` or bind once then reuse (as above). + - Inside the kernel body, prefer reading from the buffer’s shape, e.g. `M = A.shape[0]`. + +- Term-level variables via `T.dynamic(name, dtype)` + - Creates a TIR `tir.Var` you can use directly in expressions/loops. + - Handy when you need to reference the dimension symbol in the body. + +```python +# 1) Annotation-only symbol; read the bound size via shape +K = T.dyn['K'] # dtype defaults to int32 +@T.prim_func +def foo(A: T.Tensor((K,), 'float32')): + N = A.shape[0] + for i in T.serial(N): + ... + +# 2) Explicit Var symbol usable in the body +K = T.dynamic('K', 'int32') # or T.dynamic('K') defaults to int32 +@T.prim_func +def bar(A: T.Tensor((K,), 'float32')): + for i in T.serial(K): + ... +``` + +Notes +- `T.symbolic(name, dtype)` is a deprecated alias of `T.dynamic`; prefer `T.dynamic`. +- Under `@jit`, concrete sizes come from the actual tensor arguments at the first call. +- Symbols in annotations do not need to be separate kernel arguments; TileLang binds them from argument shapes. + +## 2. Launching Work with `T.Kernel` + +`with T.Kernel(...)` declares a launch context and creates block/thread +bindings. For GPU backends, specify a grid and threads per block. + +```python +with T.Kernel(grid_x, grid_y, threads=128) as (bx, by): + ... # bx/by are blockIdx.x/y +``` + +You rarely need raw thread indices; most kernels use structured loops +(`T.serial`, `T.unroll`, `T.Parallel`, `T.Pipelined`) inside a `T.Kernel`. + +## 3. 
Loops and Control Flow + +Core loop constructs map to familiar hardware patterns: + +- `T.serial(start, stop[, step])`: plain for‑loop +- `T.unroll(start, stop[, step])`: unrolled loop +- `T.Parallel(ext0, ext1, ...)`: nested parallel loops (elementwise‑friendly) +- `T.Pipelined(iters, num_stages=N)`: software pipelining for producer/consumer + +```python +for i in T.serial(N): + ... + +for i, j in T.Parallel(M, N): + C[i, j] = A[i, j] + B[i, j] + +for k in T.Pipelined(T.ceildiv(K, BK), num_stages=3): + # overlap copy/compute across stages + ... +``` + +Conditionals use standard Python `if`/`else`. Guard edges with predicates when +tile sizes do not divide problem sizes evenly. + +## 4. Memory Scopes and Allocation + +TileLang exposes key software‑managed scopes: + +- Global: device memory (default for `T.Tensor` arguments) +- Shared: on‑chip, block‑visible (`T.alloc_shared(shape, dtype)`) +- Fragment and scalars: per‑thread fragments and scalar vars but in Shared View + (`T.alloc_fragment`, `T.alloc_var`) + +```python +A_shared = T.alloc_shared((BM, BK), 'float16') +B_shared = T.alloc_shared((BK, BN), 'float16') +C_local = T.alloc_fragment((BM, BN), 'float32') +T.clear(C_local) # zero accumulators +``` + +## 5. Moving Data: `T.copy` + +Use `T.copy(src, dst)` to move tiles between scopes. It accepts buffers, +buffer regions, or buffer loads; extents are inferred or can be broadcast. + +```python +# Global -> Shared (tile copy), extents inferred from dst +T.copy(A[by * BM, ko * BK], A_shared) +T.copy(B[ko * BK, bx * BN], B_shared) + +# Fragment -> Global (store back) +T.copy(C_local, C[by * BM, bx * BN]) +``` + +`T.copy` performs coalescing and scope‑specific lowering during compilation. + +## 6. A Minimal End‑to‑End Example (Vector Add) + +```python +import tilelang +import tilelang.language as T +from tilelang import jit + +@jit # infers target from tensors at first call +def add(N: int, block: int = 256, dtype: str = 'float32'): + + @T.prim_func + def add_kernel( + A: T.Tensor((N,), dtype), + B: T.Tensor((N,), dtype), + C: T.Tensor((N,), dtype), + ): + with T.Kernel(T.ceildiv(N, block), threads=block) as bx: + for i in T.Parallel(block): + gi = bx * block + i + # Optional — LegalizeSafeMemoryAccess inserts a guard when an access may be OOB + C[gi] = A[gi] + B[gi] + + return add_kernel + +# Host side (PyTorch shown; NumPy/DLPack also supported) +import torch +N = 1 << 20 +A = torch.randn(N, device='cuda', dtype=torch.float32) +B = torch.randn(N, device='cuda', dtype=torch.float32) +C = torch.empty(N, device='cuda', dtype=torch.float32) + +kernel = add(N) +kernel(A, B, C) # runs on GPU +torch.testing.assert_close(C, A + B) +``` + +Notes +- The `@jit` wrapper returns a callable kernel after the first compilation. +- You can pass compile‑time tunables (tile sizes, dtypes) through the outer + Python function and bake them into the generated TIR. + +## 7. Tiled GEMM Skeleton + +Below is a minimal pattern for a tiled GEMM using shared memory staging and a +fragment accumulator. It mirrors the quickstart style found in the repository. 
+ +```python +@T.prim_func +def gemm( + A: T.Tensor((M, K), 'float16'), + B: T.Tensor((K, N), 'float16'), + C: T.Tensor((M, N), 'float16'), +): + with T.Kernel(T.ceildiv(N, BN), T.ceildiv(M, BM), threads=128) as (bx, by): + A_s = T.alloc_shared((BM, BK), 'float16') + B_s = T.alloc_shared((BK, BN), 'float16') + C_f = T.alloc_fragment((BM, BN), 'float32') + T.clear(C_f) + + for ko in T.Pipelined(T.ceildiv(K, BK), num_stages=3): + T.copy(A[by * BM, ko * BK], A_s) + T.copy(B[ko * BK, bx * BN], B_s) + T.gemm(A_s, B_s, C_f) # lowered to tensor‑core/ISA specific kernels + + T.copy(C_f, C[by * BM, bx * BN]) +``` + +## 8. Debugging and Printing + +Use `T.print` inside a kernel for quick introspection. TileLang emits printing +from a single thread for shared/fragment scopes to avoid floods. + +```python +T.print(C_f, msg='accumulator:') +T.print(A_s, msg='A tile:') +T.print(C[0], msg='C[0] = ') +``` + +## 9. Where to Go Next + +- Control flow details: see Programming Guides → Control Flow +- Memory topics: see Programming Guides → (removed cache/layout); basics are covered inline +- Autotuning tile sizes and mappings: Programming Guides → Autotuning +- Operator examples (GEMM, GEMV, attention): see Deep Learning Operators diff --git a/docs/programming_guides/overview.md b/docs/programming_guides/overview.md new file mode 100644 index 000000000..64b6d2039 --- /dev/null +++ b/docs/programming_guides/overview.md @@ -0,0 +1,27 @@ +# Programming Guides Overview + +This section provides a practical guide to writing high‑performance kernels with Tile Language (tile‑lang). +It mirrors the structure of a similar guide in another project and adapts it to tile‑lang concepts and APIs. + +- Audience: Developers implementing custom GPU/CPU kernels with tile‑lang +- Prereqs: Basic Python, NumPy/Tensor concepts, and familiarity with GPU programming notions +- Scope: Language basics, control flow, instructions, autotuning, and type system + +## What You’ll Learn +- How to structure kernels with TileLang’s core DSL constructs +- How to move data across global/shared/fragment and pipeline compute +- How to apply autotuning to tile sizes and schedules +- How to specify and work with dtypes in kernels + +## Suggested Reading Order +1. Language Basics +2. Control Flow +3. Instructions +4. Autotuning +5. Type System + +## Related Docs +- Tutorials: see existing guides in `tutorials/` +- Operators: examples in `deeplearning_operators/` + +> NOTE: This is a draft scaffold. Fill in code snippets and benchmarks as APIs evolve. diff --git a/docs/programming_guides/python_compatibility.md b/docs/programming_guides/python_compatibility.md new file mode 100644 index 000000000..b858e392a --- /dev/null +++ b/docs/programming_guides/python_compatibility.md @@ -0,0 +1,59 @@ +# Python Compatibility + +TileLang is a Python-embedded DSL, but not all Python syntax is supported inside +TileLang DSL. This guide clarifies what works, what doesn't, and how +to translate common Python patterns into TileLang equivalents. Specially, we focus on +the kernel part (scripts inside `with T.Kernel`) semantics. For host-side semantics when +using eager-style JIT, please stay tuned for our upcoming documentation. 
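+
+As a rough orientation before the tables below (the function, names, and shapes here are purely illustrative), everything outside the `T.prim_func` body is ordinary Python, while the code inside `with T.Kernel(...)` is parsed by TileLang and restricted to the subset documented on this page:
+
+```python
+import tilelang
+import tilelang.language as T
+
+@tilelang.jit
+def scale(N: int, factor: float = 2.0, dtype: str = 'float32'):
+    block = 256  # host-side: plain Python, no restrictions apply here
+
+    @T.prim_func
+    def kernel(A: T.Tensor((N,), dtype), B: T.Tensor((N,), dtype)):
+        with T.Kernel(T.ceildiv(N, block), threads=block) as bx:
+            # kernel-side: only the constructs listed in the tables below
+            for i in T.Parallel(block):
+                B[bx * block + i] = A[bx * block + i] * factor
+
+    return kernel
+```
+
+When a specific construct is in question, the tables below are the reference; the host side follows normal Python rules.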
+ +The following codes use the conventional aliases: + +```python +import tilelang +import tilelang.language as T +from tilelang import jit +``` + +## Control Flow & Loops + +| Python Feature | Supported | Notes / Alternative | +|-------------------------|:---------:|------------------------------------------| +| `for i in range(n)` | ✅ | Maps to `T.serial(n)` | +| `for i in range(a,b,s)` | ✅ | Maps to `T.serial(a, b, s)` | +| `for x in list` | ❌ | Use index-based loop | +| `while condition` | ✅ | | +| `if` / `elif` / `else` | ✅ | | +| `x if cond else y` | ✅ | Ternary expression | +| `break` / `continue` | ✅ | | +| `enumerate()` / `zip()` | ❌ | | + +## Data Access + +| Python Feature | Supported | Notes / Alternative | +|-------------------------|:---------:|------------------------------------------| +| `a[i]` indexing | ✅ | Multi-dim indexing supported: `a[i, j, k]` | +| `a[i:j]` slicing | ✅ | Creates `BufferRegion` | +| `a[-1]` negative index | ✅ | | + +## Assignment & Arithmetic Operations + +| Python Feature | Supported | Notes / Alternative | +|-------------------------|:---------:|------------------------------------------| +| `x = expr` | ✅ | | +| `+`, `-`, `*`, `/`, `%` | ✅ | Maps to device-side arithmetic operations | +| `+=`, `-=`, `*=`, etc. | ✅ | Augmented assignment | +| `a = b = c` | ❌ | Use separate assignments | + +## Functions & Classes + +As a kernel script language, TileLang doesn't support functions or classes. You can use `@T.macro` to define reusable code blocks, which will be inlined at compile time like `__device__` function. + +## Statements & Built-in Functions + +| Python Feature | Supported | Notes / Alternative | +|-------------------------|:---------:|------------------------------------------| +| `with` | ⚠️ | Only `T.Kernel`, `T.ws` | +| `assert` | ⚠️ | Use `T.device_assert` or `T.assert` | +| `print()` | ⚠️ | Use `T.print()`; `print` works for Python expressions | +| `len()` | ❌ | Use `buffer.shape[dim]` | +| `type()`, `isinstance()`| ❌ | | diff --git a/docs/programming_guides/type_system.md b/docs/programming_guides/type_system.md new file mode 100644 index 000000000..60061df3f --- /dev/null +++ b/docs/programming_guides/type_system.md @@ -0,0 +1,41 @@ +# Type System + +This page lists the data types supported by TileLang and how to specify them in +kernels. For full details and the authoritative list, see the API Reference +(`autoapi/tilelang/index`) and `tilelang.language.v2.dtypes`. + +How to specify dtypes +- Use any of the following forms; TileLang normalizes them internally: + - String: `'float32'`, `'int8'`, `'bfloat16'`, ... + - TileLang dtype object: `T.float32`, `T.int8`, `T.bfloat16`, ... + - Framework dtype: `torch.float32`, `torch.int8`, `torch.bfloat16`, ... + +Common scalar types +- Boolean: `bool` +- Signed integers: `int8`, `int16`, `int32`, `int64` +- Unsigned integers: `uint8`, `uint16`, `uint32`, `uint64` +- Floating‑point: `float16` (half), `bfloat16`, `float32`, `float64` + +Float8 and low‑precision families +- Float8: `float8_e3m4`, `float8_e4m3`, `float8_e4m3b11fnuz`, `float8_e4m3fn`, + `float8_e4m3fnuz`, `float8_e5m2`, `float8_e5m2fnuz`, `float8_e8m0fnu` +- Float6: `float6_e2m3fn`, `float6_e3m2fn` +- Float4: `float4_e2m1fn` + +Vectorized element types (SIMD packs) +- For many base types, vector‑packed variants are available by lane count: + `x2`, `x4`, `x8`, `x16`, `x32`, `x64`. +- Examples: + - Integers: `int8x2`, `int8x4`, ..., `int32x2`, `int32x4`, ... + - Unsigned: `uint8x2`, `uint8x4`, ... 
+ - Floats: `float16x2`, `float16x4`, `float32x2`, `float32x4`, ... + - Float8/6/4 families also provide `x2/x4/x8/x16/x32/x64` where applicable, + e.g., `float8_e4m3x2`, `float8_e4m3x4`, `float6_e2m3fnx8`, `float4_e2m1fnx16`. + +Notes +- Availability of certain low‑precision formats (float8/6/4) depends on target + architecture and backend support. +- Choose accumulation dtypes explicitly for mixed‑precision compute (e.g., + GEMM with `float16` inputs and `float32` accumulators). +- The complete, up‑to‑date list is exposed in + `tilelang.language.v2.dtypes` and rendered in the API Reference. diff --git a/docs/tutorials/auto_tuning.md b/docs/tutorials/auto_tuning.md index 3f3cad832..33368a2f0 100644 --- a/docs/tutorials/auto_tuning.md +++ b/docs/tutorials/auto_tuning.md @@ -14,7 +14,7 @@ Auto-tuning a Tile Language program involves three main steps: ## Matrix Multiplication Example -The following example demonstrates auto-tuning matrix multiplication. Code has been simplified for readability - see `examples/gemm/example_gemm.py` for complete implementation. +The following example demonstrates auto-tuning matrix multiplication. Code has been simplified for readability - see `examples/gemm/example_gemm.py` for complete implementation. ### Step 1: Implement with Reserved Parameters Users can implement matrix multiplication in Tile Language while reserving parameters for optimization: @@ -145,4 +145,4 @@ for hint in roller_hints: config["thread_num"] = block_rows * block_cols * 32 config["enable_rasteration"] = hint.rasterization_plan is not NoRasterization -``` \ No newline at end of file +``` diff --git a/docs/tutorials/debug_tools_for_tilelang.md b/docs/tutorials/debug_tools_for_tilelang.md index e18b13279..078440f34 100644 --- a/docs/tutorials/debug_tools_for_tilelang.md +++ b/docs/tutorials/debug_tools_for_tilelang.md @@ -12,7 +12,6 @@ A Tile Language program (hereafter referred to as a *program*) is transformed in 2. The program undergoes multiple *Passes* for transformation and optimization (the *lower* stage, see `tilelang/engine/lower.py`), finally producing an intermediate representation (e.g., LLVM or C for CPU, CUDA for NVIDIA GPUs, etc.). 3. The generated code is compiled by the respective compiler (e.g., nvcc) into a hardware-executable file. - ```{figure} ../_static/img/overview.png :width: 300 :alt: Overview of the compilation process @@ -22,9 +21,9 @@ A Tile Language program (hereafter referred to as a *program*) is transformed in During this process, users may encounter roughly three categories of issues: -* **Generation issues**: The Tile Language program fails to generate a valid hardware-executable file (i.e., errors during the lowering process). -* **Correctness issues**: The resulting executable runs, but produces incorrect results. -* **Performance issues**: The executable runs with performance significantly below the expected theoretical hardware limits. +- **Generation issues**: The Tile Language program fails to generate a valid hardware-executable file (i.e., errors during the lowering process). +- **Correctness issues**: The resulting executable runs, but produces incorrect results. +- **Performance issues**: The executable runs with performance significantly below the expected theoretical hardware limits. This tutorial focuses on the first two issues—how to debug generation and correctness problems. Performance tuning often requires using vendor-provided profiling tools (e.g., **Nsight Compute**, **rocProf**, etc.) 
for further hardware-level analysis, which we will address in future materials. @@ -52,7 +51,6 @@ func = matmul(1024, 1024, 1024, 128, 128, 32) TileLang essentially performs *progressive lowering*. For example, a `T.copy` may first be expanded into `T.Parallel` (see the pass `LowerTileOP`), which is then expanded again, eventually resulting in lower-level statements that can be translated to CUDA C code. - ```{figure} ../_static/img/ir_transform_diagram.png :width: 400 :alt: IR transformation diagram @@ -171,8 +169,138 @@ The output messages will include something like: msg='hello world' BlockIdx=(0, 0, 0), ThreadIdx=(0, 0, 0): 0 ``` +### Visual Layout Inference For TileLang + The **Visual Layout Inference** tool automatically generates visual diagrams that illustrate the mapping between logical indices, thread IDs, and register file locations. + +When TileLang performs layout inference, it determines how fragment buffers are distributed across threads. The visual layout tool captures this information and generates: +1. **Textual output**: A human-readable description of the layout mapping +2. **Visual diagrams**: Color-coded plots showing the thread-to-data mapping + +The visual layout inference tool is controlled through the `TL_LAYOUT_VISUALIZATION_ENABLE` and `TL_LAYOUT_VISUALIZATION_FORMATS` pass configuration. By default, `TL_LAYOUT_VISUALIZATION_ENABLE` is **disabled** to avoid performance overhead during compilation. + +When enabled, `TL_LAYOUT_VISUALIZATION_FORMATS` accepts string values to control output formats: +- "txt": Text output only (same as default) +- "all": Generates all formats (TXT, PDF, PNG, SVG) +- "png": Generate PNG format only +- "pdf": Generate PDF format only +- "svg": Generate SVG format only +- "txt,svg": Generate multiple formats (comma-separated) in addition to text output + +The output messages of "txt" will include something like: +``` +C_local inferenced layout: + Shape: [32, 32] -> [8] + Thread: _j // 16 * 64 + _i // 16 * 32 + _i % 8 * 4 + _j % 8 // 2 + Index: [_j % 16 // 8 * 4 + _i % 16 // 8 * 2 + _j % 2] +``` + +## AutoDD: Automatic Delta Debugging + +When dealing with complex TileLang programs that produce errors, manually isolating the bug can be tedious. **AutoDD** (Automatic Delta Debugging) is a built-in tool that automatically simplifies your program to the minimal code needed to reproduce a specific error. + +### What is Delta Debugging? + +Delta Debugging is an automated debugging technique that: +1. Takes a program that triggers a bug +2. Systematically removes code fragments +3. Checks if the simplified program still triggers the same bug +4. Produces the minimal code that reproduces the bug + +AutoDD uses a Probability Distribution Driven Delta Debugging (PDD) algorithm for efficient minimization. + +### Why Use AutoDD? + +- **Large codebases**: Real projects often have hundreds of lines of configuration, helper functions, and logging +- **Hard-to-locate errors**: Error messages may point to TVM/CUDA internals rather than your TileLang code +- **Time-saving**: Manually deleting code to isolate bugs is very time-consuming + +AutoDD can reduce a 200+ line program to just 30 lines, directly exposing the root cause. 
+ +### Basic Usage + +```bash +python -m tilelang.autodd --err-msg "" -o +``` + +### Parameters + +| Parameter | Description | +|-----------|-------------| +| `source` | Path to the input Python source file | +| `--err-msg` | Error message to match (searched in stdout or stderr) | +| `-o, --output` | Path to the minimized output file | +| `--backend` | Execution backend: `runner` (faster) or `subproc` (more stable), default `runner` | +| `--timeout` | Timeout for each task in seconds, default 60 | +| `-j, --jobs` | Number of parallel jobs, default 1 | + +### Example + +Suppose you have a complex TileLang program with a GEMM shape mismatch bug: + +```python +# buggy_matmul.py (200+ lines) +@tilelang.jit +def buggy_matmul(M, N, K, block_M, block_N, block_K, ...): + @T.prim_func + def matmul_kernel(...): + with T.Kernel(...) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_M, block_N), dtype) # Bug: should be (block_K, block_N) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + # ... lots of other code ... + T.gemm(A_shared, B_shared, C_local) # Error here + return matmul_kernel +``` + +Run AutoDD to minimize: + +```bash +python -m tilelang.autodd buggy_matmul.py --err-msg "Dimension mismatch" -o minimized.py -j 4 +``` + +AutoDD will produce a minimal reproduction: + +```python +# minimized.py (~30 lines) +import tilelang.language as T + +def buggy_matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32, *args, **kwargs): + @T.prim_func + def matmul_kernel(): + with T.Kernel(): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_M, block_N), dtype) # Bug exposed! + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.gemm(A_shared, B_shared, C_local) +``` + +### How AutoDD Works + +AutoDD uses AST (Abstract Syntax Tree) analysis with multiple rewrite rules: + +1. **Fast Reducers**: Remove statements, simplify if/for constructs +2. **Canonicalizers**: Expand with statements, add `*args, **kwargs` for compatibility +3. **Simplifiers**: Replace expressions with constants, simplify function calls +4. **Slow Reducers**: Remove arbitrary expressions, reduce integer constants + +### Tips + +- **Error message matching**: Use a unique substring from the error output +- **Timeout**: Increase `--timeout` for programs with long compilation times +- **Parallel jobs**: Use `-j 4` or higher to speed up minimization +- **Backend**: Try `--backend subproc` if `runner` is unstable + +### Complete Example + +A complete example is available in `examples/autodd/`: +- `tilelang_buggy.py`: A complex program with a bug (~200 lines) +- `tilelang_minimized_expected.py`: Expected output after AutoDD (~30 lines) +- `README.md`: Detailed documentation + ## Conclusion By carefully examining intermediate representations (IR) before final code generation—and by leveraging runtime printing through `T.print`—one can quickly diagnose where index calculations, copy logic, or other kernel operations deviate from the intended behavior. This two-pronged approach (inspecting IR transformations and using runtime prints) is often sufficient for resolving generation and correctness issues in TileLang programs. +For complex programs where manual debugging is tedious, **AutoDD** provides automated delta debugging to quickly isolate the minimal code that reproduces a bug. 
+ For advanced performance tuning (e.g., analyzing memory bandwidth or occupancy), more specialized profiling tools such as **Nsight Compute**, **rocProf**, or vendor-specific profilers may be required. Those aspects will be covered in future documents. diff --git a/docs/tutorials/logging.md b/docs/tutorials/logging.md new file mode 100644 index 000000000..1a015432d --- /dev/null +++ b/docs/tutorials/logging.md @@ -0,0 +1,116 @@ +Logging in Tilelang/TVM +=================================================== +
+Author: SiriusNEO +
+ +## TVM Logging Overview + +Tilelang currently utilizes the logging system from TVM. The implementation can be found in: + +- [include/tvm/runtime/logging.h](https://github.com/apache/tvm/blob/main/include/tvm/runtime/logging.h): Macro definitions +- [src/runtime/logging.cc](https://github.com/apache/tvm/blob/main/src/runtime/logging.cc): Logging logic implementation + +The design style is inspired by [Google's glog](https://google.github.io/glog/stable/). + +## Logging Categories + +There are three primary macro types: + +```c++ +LOG(INFO) << "aaa"; +DLOG(INFO) << "aaa"; +VLOG(1) << "aaa"; +``` + +- **LOG**: Standard logging preserved in code for displaying necessary information at different levels during runtime. Most Tilelang C++ error reporting is implemented via `LOG(FATAL) << "error msg"`. +- **DLOG**: Debug logging for developer debugging output. DLOG is controlled at build time by the TVM_LOG_DEBUG environment variable and is **eliminated in Release builds through dead code elimination**. + - The key difference between LOG(DEBUG) and DLOG is this build-time elimination. We recommend using DLOG over LOG(DEBUG), as the latter has overlapping functionality and gets compiled into the release runtime. +- **VLOG**: [Verbose logging](https://google.github.io/glog/stable/logging/#verbose-logging), primarily for debugging. Its main feature is customizable verbosity levels. For example, VLOG(n) where n can be 1, 2, 3, 4, 5, or 6, enabling complex tracing requirements. In contrast, LOG and DLOG typically use predefined verbose levels like INFO and DEBUG. + - In practical Tilelang development, VLOG is used less frequently. + - TVM's VLOG is implemented using DLOG, thus inheriting DLOG's characteristics. + +Additional useful macros include various **CHECK** variants: + +```c++ +CHECK(cond) << "error msg"; +DCHECK(cond) << "error msg"; +ICHECK(cond) << "error msg"; +``` + +The implementation routes errors to LogFatal: + +```c++ +#define CHECK(x) \ + if (!(x)) \ + ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << "Check failed: (" #x << ") is false: " +``` +- **DCHECK**: Debug mode CHECK, only compiled in debug builds +- **ICHECK**: Internal Check that should exist in Release builds. When ICHECK fails, the entire system should report an error. + +## Logging Verbose Levels + +TVM defines 5 levels for LOG and DLOG (adding DEBUG compared to glog): + +```c++ +#define TVM_LOG_LEVEL_DEBUG 0 +#define TVM_LOG_LEVEL_INFO 1 +#define TVM_LOG_LEVEL_WARNING 2 +#define TVM_LOG_LEVEL_ERROR 3 +#define TVM_LOG_LEVEL_FATAL 4 +``` + +## Using Logging in TileLang Development + +### Guidelines + +For temporary debugging output in your code, there are no restrictions (you can even use std::cout). Just remember to remove it before submitting a PR. + +For meaningful logging that should remain in the Tilelang codebase: + +- Critical correctness checks: Use ICHECK with sufficient error messages to facilitate debugging when issues arise. +- Complex Pass debugging: For passes requiring intermediate output that may need future review (e.g., LayoutInference), use DLOG. +- General INFO/WARNING messages: Use standard LOG. + +### Enabling Log Output in Tilelang + +To specify current log level at runtime, we need to set the environment variable `TVM_LOG_LEVEL`. An example usage is: + +```c++ +TVM_LOG_DEBUG=1 python3 code.py +``` + +which enables all DEBUG/INFO (level <= 1) logs for all files. + +#### Detailed Rules for TVM_LOG_DEBUG Specification + +The parsing logic is in `logging.cc`. 
Reference: [HyperAI Zhihu Article](https://zhuanlan.zhihu.com/p/1933106843468665163). + +Launch Python with `TVM_LOG_DEBUG=`, where `` is a comma-separated list of level assignments in the form `=`. Important notes: + +- The special filename DEFAULT sets the LOG level for all files. +- `` can be set to -1 to disable LOG for that file. +- `` is the C++ source filename (e.g., .cc, not .h) relative to the `src/` directory in the TVM repository. The `src/` prefix is optional when specifying file paths. + +### Enabling Debug Mode + +To enable DLOG/DCHECK, developers need to first build Tilelang in Debug mode: + +```bash +cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_CUDA=ON +``` + +Tilelang's CMake logic automatically adds the `TVM_LOG_DEBUG` macro, compiling all DLOG statements: + +```cmake +target_compile_definitions(tilelang_objs PRIVATE "TVM_LOG_DEBUG") +``` + +Then you also need to specify the runtime environment variables. For example, to use `DLOG(INFO) << "xxx"` for debugging, run your code with INFO level (1): `TVM_LOG_DEBUG=1`. + +:::{note} + **Important**: There are two TVM_LOG_DEBUG variables. (1) Compile-time macro: Determines whether debug content (like DLOG) is compiled into the .so file. Referenced in C++ source via #ifdef TVM_LOG_DEBUG. This is automatically enabled when using Debug build mode in CMake. (2) Runtime environment variable: Controls logging level at runtime. TVM provides a specification for this variable, allowing control over per-file logging levels. + + These two should ideally have different names, but TVM uses the same name for both, which can cause confusion. +::: diff --git a/examples/amd/example_amd_flash_attn_bwd.py b/examples/amd/example_amd_flash_attn_bwd.py index d47866e1e..27986ce78 100644 --- a/examples/amd/example_amd_flash_attn_bwd.py +++ b/examples/amd/example_amd_flash_attn_bwd.py @@ -2,7 +2,7 @@ import torch.nn.functional as F import tilelang import tilelang.language as T -from tilelang.primitives.gemm.base import GemmWarpPolicy +from tilelang.tileop.base import GemmWarpPolicy import itertools import argparse from functools import partial @@ -11,22 +11,20 @@ def ref_program(Q, K, V, is_causal, groups=1): - assert Q.size( - 2) == K.size(2) * groups, f"Q heads {Q.size(2)} K heads {K.size(2)} groups {groups}" - assert Q.size( - 2) == V.size(2) * groups, f"Q heads {Q.size(2)} V heads {V.size(2)} groups {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q heads {Q.size(2)} K heads {K.size(2)} groups {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q heads {Q.size(2)} V heads {V.size(2)} groups {groups}" dim = Q.size(-1) K_ref = K.repeat_interleave(groups, dim=2) V_ref = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K_ref) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K_ref) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V_ref) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V_ref) lse = torch.logsumexp(scores, dim=-1).float() return output, lse @@ -45,23 +43,23 @@ def get_fwd_configs(): valid_configs = [] - for m, n, s, t, stages, r, k, p, qkw, vw in itertools.product(block_M, block_N, num_split_q, - threads, num_stages, - 
enable_rasterization, k_pack, - panel_size, qk_coalesced_width, - v_coalesced_width): - valid_configs.append({ - "block_M": m, - "block_N": n, - "num_split_q": s, - "threads": t, - "num_stages": stages, - "enable_rasterization": r, - "k_pack": k, - "panel_size": p, - "qk_coalesced_width": qkw, - "v_coalesced_width": vw, - }) + for m, n, s, t, stages, r, k, p, qkw, vw in itertools.product( + block_M, block_N, num_split_q, threads, num_stages, enable_rasterization, k_pack, panel_size, qk_coalesced_width, v_coalesced_width + ): + valid_configs.append( + { + "block_M": m, + "block_N": n, + "num_split_q": s, + "threads": t, + "num_stages": stages, + "enable_rasterization": r, + "k_pack": k, + "panel_size": p, + "qk_coalesced_width": qkw, + "v_coalesced_width": vw, + } + ) return valid_configs @@ -85,23 +83,23 @@ def fast_flashattn( qk_coalesced_width: int, v_coalesced_width: int, ): - scale = (1.0 / dim)**0.5 + scale = (1.0 / dim) ** 0.5 head_kv = heads // groups q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 vec_size = qk_coalesced_width v_vec_size = v_coalesced_width @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), - LSE: T.Tensor([batch, heads, seq_len], accum_dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + LSE: T.Tensor([batch, heads, seq_len], accum_dtype), ): with T.Kernel(num_split_q, batch * heads, threads=threads) as (b_split, byz_combined): T.use_swizzle(panel_size, enable=enable_rasterization) @@ -111,10 +109,10 @@ def main( num_q_blocks = T.ceildiv(seq_len, block_M) - bx_loop_var = T.alloc_var("int32") + bx_loop_var = T.alloc_var(T.int32) bx_loop_var = b_split - with T.While(bx_loop_var < num_q_blocks): + while bx_loop_var < num_q_blocks: acc_o = T.alloc_fragment([block_M, dim], accum_dtype) m_i = T.alloc_fragment([block_M], accum_dtype) l_i = T.alloc_fragment([block_M], accum_dtype) @@ -135,33 +133,21 @@ def main( m_prev = T.alloc_fragment([block_M], accum_dtype) scale_factor = T.alloc_fragment([block_M], accum_dtype) - T.copy( - Q[bz, q_block_offset:q_block_offset + block_M, by, :], - Q_shared, - coalesced_width=vec_size) + T.copy(Q[bz, q_block_offset : q_block_offset + block_M, by, :], Q_shared, coalesced_width=vec_size) - loop_end_k = ( - T.ceildiv(q_block_offset + - block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_end_k = T.ceildiv(q_block_offset + block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) row_sum = T.alloc_fragment([block_M], accum_dtype) for k in T.Pipelined(loop_end_k, num_stages=num_stages): kv_idx = k * block_N - T.copy( - K[bz, kv_idx:kv_idx + block_N, by // groups, :], - K_shared, - coalesced_width=vec_size) - T.copy( - V[bz, kv_idx:kv_idx + block_N, by // groups, :], - V_shared, - coalesced_width=v_vec_size) + T.copy(K[bz, kv_idx : kv_idx + block_N, by // groups, :], K_shared, coalesced_width=vec_size) + T.copy(V[bz, kv_idx : kv_idx + block_N, by // groups, :], V_shared, coalesced_width=v_vec_size) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) T.gemm( @@ -178,6 +164,8 @@ def main( T.copy(m_i, 
m_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for i in T.Parallel(block_M): + m_i[i] = T.max(m_i[i], m_prev[i]) for i in T.Parallel(block_M): if m_prev[i] == -T.infinity(accum_dtype): @@ -214,8 +202,7 @@ def main( for i in T.Parallel(block_M): if q_block_offset + i < seq_len: - lse_val = T.if_then_else(l_i[i] > 0, - T.log(l_i[i]) + m_i[i], -T.infinity(accum_dtype)) + lse_val = T.if_then_else(l_i[i] > 0, T.log(l_i[i]) + m_i[i], -T.infinity(accum_dtype)) LSE[bz, by, q_block_offset + i] = lse_val bx_loop_var = current_bx + num_split_q @@ -232,30 +219,30 @@ def get_bwd_configs(): panel_size = [7, 8, 9, 10] configs = [] - for m, n, stages, t, r, p in itertools.product(block_M, block_N, num_stages, threads, - enable_rasterization, panel_size): - configs.append({ - "block_M": m, - "block_N": n, - "num_stages": stages, - "threads": t, - "enable_rasterization": r, - "panel_size": p, - }) + for m, n, stages, t, r, p in itertools.product(block_M, block_N, num_stages, threads, enable_rasterization, panel_size): + configs.append( + { + "block_M": m, + "block_N": n, + "num_stages": stages, + "threads": t, + "enable_rasterization": r, + "panel_size": p, + } + ) return configs @tilelang.jit(out_idx=[2]) def flashattn_bwd_preprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim] blk = 32 @T.prim_func - def flash_bwd_prep(O: T.Tensor(shape, dtype), dO: T.Tensor(shape, dtype), - Delta: T.Tensor([batch, heads, seq_len], accum_dtype)): + def flash_bwd_prep(O: T.Tensor(shape, dtype), dO: T.Tensor(shape, dtype), Delta: T.Tensor([batch, heads, seq_len], accum_dtype)): with T.Kernel(batch, heads, T.ceildiv(seq_len, blk)) as (bz, bx, by): o = T.alloc_fragment([blk, blk], dtype) do = T.alloc_fragment([blk, blk], dtype) @@ -263,36 +250,51 @@ def flash_bwd_prep(O: T.Tensor(shape, dtype), dO: T.Tensor(shape, dtype), delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep @tilelang.autotune(configs=get_bwd_configs(), cache_input_tensors=True) @tilelang.jit -def flashattn_bwd(batch, heads, seq_len, dim, is_causal, groups, block_M: int, block_N: int, - num_stages: int, threads: int, enable_rasterization: bool, panel_size: int): - sm_scale = (1.0 / dim)**0.5 +def flashattn_bwd( + batch, + heads, + seq_len, + dim, + is_causal, + groups, + block_M: int, + block_N: int, + num_stages: int, + threads: int, + enable_rasterization: bool, + panel_size: int, +): + sm_scale = (1.0 / dim) ** 0.5 head_kv = heads // groups q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func - def flash_bwd_kernel(Q: T.Tensor(q_shape, - dtype), K: T.Tensor(kv_shape, - dtype), V: T.Tensor(kv_shape, dtype), - dO: T.Tensor(q_shape, dtype), lse: T.Tensor([batch, heads, seq_len], - accum_dtype), - Delta: T.Tensor([batch, heads, seq_len], - accum_dtype), dQ: 
T.Tensor(q_shape, accum_dtype), - dK: T.Tensor(kv_shape, accum_dtype), dV: T.Tensor(kv_shape, accum_dtype)): + def flash_bwd_kernel( + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + dO: T.Tensor(q_shape, dtype), + lse: T.Tensor([batch, heads, seq_len], accum_dtype), + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), + dQ: T.Tensor(q_shape, accum_dtype), + dK: T.Tensor(kv_shape, accum_dtype), + dV: T.Tensor(kv_shape, accum_dtype), + ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): T.use_swizzle(panel_size, enable=enable_rasterization) @@ -313,8 +315,8 @@ def flash_bwd_kernel(Q: T.Tensor(q_shape, dk = T.alloc_fragment([block_M, dim], accum_dtype) dq = T.alloc_fragment([block_N, dim], accum_dtype) - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) @@ -322,22 +324,21 @@ def flash_bwd_kernel(Q: T.Tensor(q_shape, loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q_shared) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q_shared) T.clear(qkT) T.gemm(K_shared, q_shared, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): P_acc[i, j] = T.exp(qkT[i, j] * sm_scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - P_acc[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, - P_acc[i, j], 0.0) + P_acc[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, P_acc[i, j], 0.0) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do_shared) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do_shared) T.clear(dP) T.gemm(V_shared, do_shared, dP, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -345,7 +346,7 @@ def flash_bwd_kernel(Q: T.Tensor(q_shape, T.copy(P_acc, p_cast) T.gemm(p_cast, do_shared, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta_shared) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta_shared) for i, j in T.Parallel(block_M, block_N): p_cast[i, j] = P_acc[i, j] * (dP[i, j] - delta_shared[j]) * sm_scale @@ -367,8 +368,8 @@ def flash_bwd_kernel(Q: T.Tensor(q_shape, @tilelang.jit(out_idx=[1]) def flashattn_bwd_postprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim] blk = 64 @@ -376,8 +377,8 @@ def flashattn_bwd_postprocess(batch, heads, seq_len, dim): def flash_bwd_post(dQ_in: T.Tensor(shape, accum_dtype), dQ_out: T.Tensor(shape, dtype)): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.copy( - dQ_in[bz, bx * blk:(bx + 1) * blk, by, :], - dQ_out[bz, bx * blk:(bx + 1) * blk, by, :], + dQ_in[bz, bx * blk : (bx + 1) * blk, by, :], + dQ_out[bz, bx * blk : (bx + 1) * blk, by, :], ) return flash_bwd_post @@ -444,22 +445,14 @@ def benchmark_function(func, *args, warmup=10, repeat=100): return np.median(times) -def main(batch: int = 1, - heads: int = 8, - seq_len: int = 4096, - dim: int = 
128, - is_causal: bool = False, - groups: int = 1): - +def main(batch: int = 1, heads: int = 8, seq_len: int = 4096, dim: int = 128, is_causal: bool = False, groups: int = 1): device = "cuda" dtype = torch.float16 torch.manual_seed(42) torch.cuda.manual_seed(42) - print( - f"Test configuration: batch={batch}, heads={heads}, seq_len={seq_len}, dim={dim}, is_causal={is_causal}, groups={groups}" - ) + print(f"Test configuration: batch={batch}, heads={heads}, seq_len={seq_len}, dim={dim}, is_causal={is_causal}, groups={groups}") flops_per_gemm = 2.0 * batch * heads * seq_len * seq_len * dim total_flops = 5 * flops_per_gemm @@ -515,22 +508,19 @@ def main(batch: int = 1, o_ref.backward(dO) print("Verifying backward pass correctness...") - dq_close, dq_max_diff, dq_mean_diff = debug_tensor_comparison( - dQ_tl, q_ref.grad, "dQ", rtol=0.05, atol=0.05) + dq_close, dq_max_diff, dq_mean_diff = debug_tensor_comparison(dQ_tl, q_ref.grad, "dQ", rtol=0.05, atol=0.05) if dq_close: print("dQ is correct.") else: print("dQ mismatch detected.") - dk_close, dk_max_diff, dk_mean_diff = debug_tensor_comparison( - dK_tl.to(torch.float16), k_ref.grad, "dK", rtol=0.05, atol=0.05) + dk_close, dk_max_diff, dk_mean_diff = debug_tensor_comparison(dK_tl.to(torch.float16), k_ref.grad, "dK", rtol=0.05, atol=0.05) if dk_close: print("dK is correct.") else: print("dK mismatch detected.") - dv_close, dv_max_diff, dv_mean_diff = debug_tensor_comparison( - dV_tl.to(torch.float16), v_ref.grad, "dV", rtol=0.05, atol=0.05) + dv_close, dv_max_diff, dv_mean_diff = debug_tensor_comparison(dV_tl.to(torch.float16), v_ref.grad, "dV", rtol=0.05, atol=0.05) if dv_close: print("dV is correct.") else: @@ -551,9 +541,7 @@ def run_reference_fwd_bwd(): torch.cuda.synchronize() ref_latency = benchmark_function(run_reference_fwd_bwd, warmup=10, repeat=100) - print( - f"Reference PyTorch Forward+Backward: {ref_latency:.2f} ms | {total_flops / ref_latency * 1e-9:.2f} TFlops" - ) + print(f"Reference PyTorch Forward+Backward: {ref_latency:.2f} ms | {total_flops / ref_latency * 1e-9:.2f} TFlops") def run_complete_fwd_bwd(): o_tl_bench, lse_tl_bench = fwd_kernel(q, k, v) @@ -591,12 +579,12 @@ def run_complete_fwd_bwd(): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=8, help='heads') - parser.add_argument('--seq_len', type=int, default=1024, help='sequence length') - parser.add_argument('--dim', type=int, default=64, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--groups', type=int, default=1, help='groups') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=8, help="heads") + parser.add_argument("--seq_len", type=int, default=1024, help="sequence length") + parser.add_argument("--dim", type=int, default=64, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--groups", type=int, default=1, help="groups") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups) diff --git a/examples/amd/example_amd_flash_attn_fwd.py b/examples/amd/example_amd_flash_attn_fwd.py index 6ec5db1e5..581619220 100644 --- a/examples/amd/example_amd_flash_attn_fwd.py +++ b/examples/amd/example_amd_flash_attn_fwd.py @@ -2,29 +2,42 @@ import torch.nn.functional as F import tilelang import 
tilelang.language as T -from tilelang.primitives.gemm.base import GemmWarpPolicy +from tilelang.tileop.base import GemmWarpPolicy import itertools import argparse from functools import partial +# Custom supply function to ensure tensors are created on GPU +def supply_tensors_gpu(params): + """Supply function that creates tensors on GPU for ROCm/HIP.""" + tensors = [] + for param in params: + if hasattr(param, "shape") and hasattr(param, "dtype"): + # Force creation on GPU device + shape = [int(s) for s in param.shape] + tensor = torch.randn(shape, dtype=param.dtype, device="cuda") + tensors.append(tensor) + else: + tensors.append(param) + return tensors + + def ref_program(Q, K, V, is_causal, groups=1): - assert Q.size( - 2) == K.size(2) * groups, f"Q heads {Q.size(2)} K heads {K.size(2)} groups {groups}" - assert Q.size( - 2) == V.size(2) * groups, f"Q heads {Q.size(2)} V heads {V.size(2)} groups {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q heads {Q.size(2)} K heads {K.size(2)} groups {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q heads {Q.size(2)} V heads {V.size(2)} groups {groups}" dim = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -43,27 +56,27 @@ def get_configs(): valid_configs = [] - for m, n, s, t, stages, r, k, p, qkw, vw in itertools.product(block_M, block_N, num_split_q, - threads, num_stages, - enable_rasterization, k_pack, - panel_size, qk_coalesced_width, - v_coalesced_width): - valid_configs.append({ - "block_M": m, - "block_N": n, - "num_split_q": s, - "threads": t, - "num_stages": stages, - "enable_rasterization": r, - "k_pack": k, - "panel_size": p, - "qk_coalesced_width": qkw, - "v_coalesced_width": vw, - }) + for m, n, s, t, stages, r, k, p, qkw, vw in itertools.product( + block_M, block_N, num_split_q, threads, num_stages, enable_rasterization, k_pack, panel_size, qk_coalesced_width, v_coalesced_width + ): + valid_configs.append( + { + "block_M": m, + "block_N": n, + "num_split_q": s, + "threads": t, + "num_stages": stages, + "enable_rasterization": r, + "k_pack": k, + "panel_size": p, + "qk_coalesced_width": qkw, + "v_coalesced_width": vw, + } + ) return valid_configs -@tilelang.autotune(configs=get_configs(), cache_input_tensors=True) +@tilelang.autotune(configs=get_configs(), cache_input_tensors=True, supply_prog=supply_tensors_gpu) @tilelang.jit(out_idx=[3]) def fast_flashattn( batch, @@ -83,22 +96,22 @@ def fast_flashattn( qk_coalesced_width: int, v_coalesced_width: int, ): - scale = (1.0 / dim)**0.5 + scale = (1.0 / dim) ** 0.5 head_kv = heads // groups q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 vec_size = qk_coalesced_width v_vec_size = v_coalesced_width @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - 
Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(num_split_q, batch * heads, threads=threads) as (b_split, byz_combined): T.use_swizzle(panel_size, enable=enable_rasterization) @@ -108,10 +121,10 @@ def main( num_q_blocks = T.ceildiv(seq_len, block_M) - bx = T.alloc_var("int32") + bx = T.alloc_var(T.int32) bx = b_split - with T.While(bx < num_q_blocks): + while bx < num_q_blocks: acc_o = T.alloc_fragment([block_M, dim], accum_dtype) m_i = T.alloc_fragment([block_M], accum_dtype) l_i = T.alloc_fragment([block_M], accum_dtype) @@ -132,32 +145,21 @@ def main( m_prev = T.alloc_fragment([block_M], accum_dtype) scale_factor = T.alloc_fragment([block_M], accum_dtype) - T.copy( - Q[bz, q_block_offset:q_block_offset + block_M, by, :], - Q_shared, - coalesced_width=vec_size) + T.copy(Q[bz, q_block_offset : q_block_offset + block_M, by, :], Q_shared, coalesced_width=vec_size) - loop_end_k = T.ceildiv(q_block_offset + block_M, - block_N) if is_causal else T.ceildiv(seq_len, block_N) + loop_end_k = T.ceildiv(q_block_offset + block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) row_sum = T.alloc_fragment([block_M], accum_dtype) for k in T.Pipelined(loop_end_k, num_stages=num_stages): kv_idx = k * block_N - T.copy( - K[bz, kv_idx:kv_idx + block_N, by // groups, :], - K_shared, - coalesced_width=vec_size) - T.copy( - V[bz, kv_idx:kv_idx + block_N, by // groups, :], - V_shared, - coalesced_width=v_vec_size) + T.copy(K[bz, kv_idx : kv_idx + block_N, by // groups, :], K_shared, coalesced_width=vec_size) + T.copy(V[bz, kv_idx : kv_idx + block_N, by // groups, :], V_shared, coalesced_width=v_vec_size) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) T.gemm( @@ -171,6 +173,8 @@ def main( T.copy(m_i, m_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for i in T.Parallel(block_M): + m_i[i] = T.max(m_i[i], m_prev[i]) for i in T.Parallel(block_M): sf = T.exp(m_prev[i] * scale - m_i[i] * scale) @@ -205,13 +209,7 @@ def main( return main -def main(batch: int = 1, - heads: int = 8, - seq_len: int = 4096, - dim: int = 128, - is_causal: bool = False, - groups: int = 1): - +def main(batch: int = 1, heads: int = 8, seq_len: int = 4096, dim: int = 128, is_causal: bool = False, groups: int = 1): flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim total_flops = 2 * flops_per_matmul if is_causal: @@ -233,18 +231,16 @@ def main(batch: int = 1, print(f"Reference (PyTorch): {latency:.2f} ms | {total_flops / latency * 1e-9:.2f} TFlops") latency = profiler.do_bench(warmup=100) - print( - f"Fast Flash Attention V2 (Tile-lang): {latency:.2f} ms | {total_flops / latency * 1e-9:.2f} TFlops" - ) + print(f"Fast Flash Attention V2 (Tile-lang): {latency:.2f} ms | {total_flops / latency * 1e-9:.2f} TFlops") if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=8, help='heads') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--groups', type=int, 
default=1, help='groups') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=8, help="heads") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--groups", type=int, default=1, help="groups") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups) diff --git a/examples/analyze/README.md b/examples/analyze/README.md index 8171d8826..1c2788b0b 100644 --- a/examples/analyze/README.md +++ b/examples/analyze/README.md @@ -21,9 +21,9 @@ M = N = K = 1024 def kernel(block_M=128, block_N=128, block_K=32, num_stages=3, thread_num=128): @T.prim_func - def main(A: T.Tensor((M, K), "float16"), - B: T.Tensor((N, K), "float16"), - C: T.Tensor((M, N), "float")): + def main(A: T.Tensor((M, K), T.float16), + B: T.Tensor((N, K), T.float16), + C: T.Tensor((M, N), T.float)): # ... (kernel definition) return main @@ -40,9 +40,9 @@ from tilelang.carver.arch import CUDA def kernel(N=64, C=256, H=512, W=512, F=512, K=3, block_M=64, block_N=128): @T.prim_func - def main(data: T.Tensor((N, H, W, C), "float16"), - kernel: T.Tensor((K, K, C, F), "float16"), - out: T.Tensor((N, (H-K+1), (W-K+1), F), "float")): + def main(data: T.Tensor((N, H, W, C), T.float16), + kernel: T.Tensor((K, K, C, F), T.float16), + out: T.Tensor((N, (H-K+1), (W-K+1), F), T.float)): # ... (convolution kernel definition) return main @@ -64,10 +64,10 @@ class AnalysisResult: ``` ### `Analyzer` Class Methods #### `analysis(fn, device)` -* ​Parameters: - * fn: TVM IRModule or PrimFunc - * device: Device configuration object -* Returns: AnalysisResult +- ​Parameters: + - fn: TVM IRModule or PrimFunc + - device: Device configuration object +- Returns: AnalysisResult #### Supported Architectures ```python # Extendable to custom hardware via: "compute_capability": (cores_per_SM, clock_GHz, flops_per_cycle, max_SM_count) diff --git a/examples/analyze/example_conv_analyze.py b/examples/analyze/example_conv_analyze.py index 540fcf4b7..06e5a86e9 100644 --- a/examples/analyze/example_conv_analyze.py +++ b/examples/analyze/example_conv_analyze.py @@ -2,7 +2,6 @@ from tilelang.tools import Analyzer from tilelang.carver.arch import CUDA from tilelang.carver.arch import CDNA -from tilelang.layout import make_swizzled_layout import torch N = 64 @@ -25,38 +24,21 @@ def check_hopper(): return False -def kernel(N, - C, - H, - W, - F, - K, - S, - D, - P, - block_M, - block_N, - block_K, - num_stages, - threads, - dtype="float16", - accum_dtype="float"): +def kernel(N, C, H, W, F, K, S, D, P, block_M, block_N, block_K, num_stages, threads, dtype=T.float16, accum_dtype=T.float32): KH, KW = K, K OH = (H + 2 * P - D * (K - 1) - 1) // S + 1 OW = (W + 2 * P - D * (K - 1) - 1) // S + 1 - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 is_hopper = check_hopper() @T.prim_func def conv( - data: T.Tensor((N, H, W, C), dtype), - kernel: T.Tensor((KH, KW, C, F), dtype), - out: T.Tensor((N, OH, OW, F), dtype), + data: T.Tensor((N, H, W, C), dtype), + kernel: T.Tensor((KH, KW, C, F), dtype), + out: T.Tensor((N, OH, OW, F), dtype), ): - with T.Kernel( - T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), - threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), threads=threads) 
as (bx, by): data_shared = T.alloc_shared((block_M, block_K), dtype) kernel_shared = T.alloc_shared((block_K, block_N), dtype) out_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -65,12 +47,6 @@ def conv( kernel_flat = T.Tensor((KH * KW * C, F), dtype, kernel.data) out_flat = T.Tensor((N * OH * OW, F), dtype, out.data) - T.annotate_layout({ - out_shared: make_swizzled_layout(out_shared), - data_shared: make_swizzled_layout(data_shared), - kernel_shared: make_swizzled_layout(kernel_shared), - }) - T.clear(out_local) for k_iter in T.Pipelined(T.ceildiv(KH * KW * C, block_K), num_stages=num_stages): if is_hopper: @@ -81,10 +57,8 @@ def conv( m = by * block_M + i access_h = m % (OH * OW) // OW * S + k // (KW * C) * D - P access_w = m % OW * S + k // C % KW * D - P - in_bound = ((access_h >= 0) and (access_w >= 0) and (access_h < H) and - (access_w < W)) - data_shared[i, j] = T.if_then_else( - in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) + in_bound = (access_h >= 0) and (access_w >= 0) and (access_h < H) and (access_w < W) + data_shared[i, j] = T.if_then_else(in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) T.copy(kernel_flat[k_iter * block_K, bx * block_N], kernel_shared) T.gemm(data_shared, kernel_shared, out_local) diff --git a/examples/analyze/example_gemm_analyze.py b/examples/analyze/example_gemm_analyze.py index bfd934f6a..0367af126 100644 --- a/examples/analyze/example_gemm_analyze.py +++ b/examples/analyze/example_gemm_analyze.py @@ -15,14 +15,14 @@ def kernel( thread_num=None, enable_rasteration=None, ): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def matmul( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) diff --git a/examples/attention_sink/README.md b/examples/attention_sink/README.md index ed4b7004e..2cba8f0cc 100644 --- a/examples/attention_sink/README.md +++ b/examples/attention_sink/README.md @@ -2,7 +2,6 @@ We compare with an optimized version of the official Triton implementation [here](https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py). - ## Algorithm ### Forward The only change from vanilla FlashAttention is that `sinks` should be taken into consideration in the softmax, which requires an extra rescaling at the epilogue stage. @@ -43,4 +42,4 @@ where $P_{b, h, q}$ is the proportion of $sink_h$ in the softmax in the $b$-th b | 16384 | 64 | 309.46 | **400.62** | 1.29x | | 16384 | 128 | 418.99 | **549.11** | 1.31x | -> The backward performance will be further optimized in the future. \ No newline at end of file +> The backward performance will be further optimized in the future. 
diff --git a/examples/attention_sink/benchmark_gqa_sink_fwd.py b/examples/attention_sink/benchmark_gqa_sink_fwd.py index 1b7de6b6f..211ef1d18 100644 --- a/examples/attention_sink/benchmark_gqa_sink_fwd.py +++ b/examples/attention_sink/benchmark_gqa_sink_fwd.py @@ -1,6 +1,7 @@ import torch import argparse from tilelang.profiler import do_bench +from tilelang import language as T import triton import triton.language as tl from triton.tools.tensor_descriptor import TensorDescriptor @@ -51,8 +52,7 @@ def triton_kernel( q = Q.load([off_z, off_h, start_m * BLOCK_M, 0]).reshape([BLOCK_M, HEAD_DIM]) if BANDWIDTH: - lo, hi = tl.maximum(0, start_q + start_m * BLOCK_M - - BANDWIDTH), start_q + (start_m + 1) * BLOCK_M + lo, hi = tl.maximum(0, start_q + start_m * BLOCK_M - BANDWIDTH), start_q + (start_m + 1) * BLOCK_M else: lo, hi = 0, start_q + (start_m + 1) * BLOCK_M @@ -120,7 +120,8 @@ def triton_program(Q, K, V, Sinks, window_size: Optional[int] = None) -> torch.T BANDWIDTH=window_size, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, - start_q=seq_kv - seq_q) + start_q=seq_kv - seq_q, + ) return o @@ -135,14 +136,14 @@ def main( dtype: str = "float16", tune: bool = False, ): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= seq_q - flops_per_matmul = 2.0 * batch * heads * min( - window_size, seq_kv // 2) * seq_q * dim # just a rough estimation + flops_per_matmul = 2.0 * batch * heads * min(window_size, seq_kv // 2) * seq_q * dim # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim * 0.5 total_flops = 2 * flops_per_matmul @@ -170,15 +171,14 @@ def main( block_N=block_N, num_stages=num_stages, threads=threads, - dtype=dtype) + dtype=dtype, + ) Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, groups, dtype=torch_dtype) if torch.allclose( - triton_program(Q, K, V, sinks, window_size), - ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), - rtol=1e-2, - atol=1e-2): + triton_program(Q, K, V, sinks, window_size), ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), rtol=1e-2, atol=1e-2 + ): print("Checks for triton passed.✅") else: print("Checks for triton failed.❌") @@ -198,20 +198,14 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='heads') - parser.add_argument('--seq_q', type=int, default=2048, help='sequence length of query') - parser.add_argument('--seq_kv', type=int, default=2048, help='sequence length of key/value') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--groups', type=int, default=8, help='groups') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="heads") + parser.add_argument("--seq_q", type=int, default=2048, help="sequence length of query") + 
parser.add_argument("--seq_kv", type=int, default=2048, help="sequence length of key/value") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--groups", type=int, default=8, help="groups") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default="float16", help="dtype, can be float16 or bfloat16") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.groups, args.window_size, - args.dtype, args.tune) + main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.groups, args.window_size, args.dtype, args.tune) diff --git a/examples/attention_sink/benchmark_mha_sink_fwd.py b/examples/attention_sink/benchmark_mha_sink_fwd.py index f50b94535..50747e6b0 100644 --- a/examples/attention_sink/benchmark_mha_sink_fwd.py +++ b/examples/attention_sink/benchmark_mha_sink_fwd.py @@ -1,6 +1,7 @@ import torch import argparse from tilelang.profiler import do_bench +from tilelang import language as T import triton import triton.language as tl from triton.tools.tensor_descriptor import TensorDescriptor @@ -50,8 +51,7 @@ def triton_kernel( q = Q.load([off_z, off_h, start_m * BLOCK_M, 0]).reshape([BLOCK_M, HEAD_DIM]) if BANDWIDTH: - lo, hi = tl.maximum(0, start_q + start_m * BLOCK_M - - BANDWIDTH), start_q + (start_m + 1) * BLOCK_M + lo, hi = tl.maximum(0, start_q + start_m * BLOCK_M - BANDWIDTH), start_q + (start_m + 1) * BLOCK_M else: lo, hi = 0, start_q + (start_m + 1) * BLOCK_M @@ -117,26 +117,29 @@ def triton_program(Q, K, V, Sinks, window_size: Optional[int] = None) -> torch.T BANDWIDTH=window_size, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, - start_q=seq_kv - seq_q) + start_q=seq_kv - seq_q, + ) return o -def main(batch: int = 1, - heads: int = 32, - seq_q: int = 256, - seq_kv: int = 256, - dim: int = 128, - window_size: Optional[int] = None, - dtype: str = "float16", - tune: bool = False): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] +def main( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + window_size: Optional[int] = None, + dtype: str = "float16", + tune: bool = False, +): + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= seq_q - flops_per_matmul = 2.0 * batch * heads * min( - window_size, seq_kv // 2) * seq_q * dim # just a rough estimation + flops_per_matmul = 2.0 * batch * heads * min(window_size, seq_kv // 2) * seq_q * dim # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim * 0.5 total_flops = 2 * flops_per_matmul @@ -163,15 +166,14 @@ def main(batch: int = 1, block_N=block_N, num_stages=num_stages, threads=threads, - dtype=dtype) + dtype=dtype, + ) Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, dtype=torch_dtype) torch.testing.assert_close( - kernel(Q, K, V, sinks), - ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), - rtol=1e-2, - atol=1e-2) + kernel(Q, K, V, sinks), ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), rtol=1e-2, atol=1e-2 + ) print("All checks passed.✅") latency = do_bench(lambda: triton_program(Q, K, V, sinks, window_size), warmup=500) @@ 
-184,19 +186,13 @@ def main(batch: int = 1, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_q', type=int, default=4096, help='sequence length of query') - parser.add_argument('--seq_kv', type=int, default=4096, help='sequence length of key/value') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") - parser.add_argument('--tune', action='store_true', help='tune') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_q", type=int, default=4096, help="sequence length of query") + parser.add_argument("--seq_kv", type=int, default=4096, help="sequence length of key/value") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default="float16", help="dtype, can be float16 or bfloat16") + parser.add_argument("--tune", action="store_true", help="tune") args = parser.parse_args() - main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, - args.tune) + main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, args.tune) diff --git a/examples/attention_sink/example_gqa_sink_bwd_bhsd.py b/examples/attention_sink/example_gqa_sink_bwd_bhsd.py index eec43db99..cfdcd21b5 100644 --- a/examples/attention_sink/example_gqa_sink_bwd_bhsd.py +++ b/examples/attention_sink/example_gqa_sink_bwd_bhsd.py @@ -13,50 +13,50 @@ def get_bwd_configs(): sm_version = sm_major * 10 + sm_minor if sm_version == 80: return 64, 32, 1, 128 - elif sm_version == 90: - return 128, 32, 2, 256 else: - raise ValueError(f"Unsupported SM version: {sm_version}") + return 128, 32, 2, 256 @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd( - batch, - heads, - seq_len, - dim, - groups=1, - window_size=None, # None for full attention - sm_scale=None, - block_M=64, - block_N=64, - num_stages=1, - threads=128, - dtype: str = "float16"): - + batch, + heads, + seq_len, + dim, + groups=1, + window_size=None, # None for full attention + sm_scale=None, + block_M=64, + block_N=64, + num_stages=1, + threads=128, + dtype: T.dtype = T.float16, +): if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, heads, seq_len, dim] kv_shape = [batch, head_kv, seq_len, dim] - accum_dtype = "float" + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(kv_shape, dtype), # type: ignore - V: T.Tensor(kv_shape, dtype), # type: ignore - Output: T.Tensor(q_shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Sinks: T.Tensor([heads], 
dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(kv_shape, dtype), # type: ignore + V: T.Tensor(kv_shape, dtype), # type: ignore + Output: T.Tensor(q_shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Sinks: T.Tensor([heads], dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -72,8 +72,7 @@ def flash_fwd( logsum = T.alloc_fragment([block_M], accum_dtype) sinks = T.alloc_fragment([heads], dtype) - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -81,31 +80,30 @@ def flash_fwd( sinks[i] = Sinks[by] end = T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) - start = T.max(0, - (bx * block_M - window_size) // block_N) if window_size is not None else 0 + start = T.max(0, (bx * block_M - window_size) // block_N) if window_size is not None else 0 for k in T.Pipelined(start, end, num_stages=num_stages): - T.copy(K[bz, by // groups, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by // groups, k * block_N : (k + 1) * block_N, :], K_shared) for i, j in T.Parallel(block_M, block_N): q_idx = bx * block_M + i k_idx = k * block_N + j if window_size is not None: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, - 0, -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, -T.infinity(acc_s.dtype)) else: acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, by // groups, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by // groups, k * block_N : (k + 1) * block_N, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # NOTE(wt): check_inf is necessary for sliding window attention. 
for i in T.Parallel(block_M): if window_size is not None: - scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, - scores_max[i]) + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -122,32 +120,33 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i in T.Parallel(block_M): - logsum[i] += T.exp2(sinks[i] * 1.44269504 - - scores_max[i] * scale) # The only change for attention sink + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) # The only change for attention sink for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(acc_o, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: str = "float16"): - accum_dtype = "float" + }, +) +def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -156,65 +155,61 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], do) + T.copy(O[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep def make_dq_layout(dQ): # atomicAdd can not be vectorized, so we need to reorder dq to match the 8x8 gemm fragment - return T.Layout(dQ.shape, - lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) + return T.Layout(dQ.shape, lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) @tilelang.jit( - out_idx=[1], pass_configs={ + out_idx=[1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_bwd_postprocess(batch, heads, seq_len, dim, dtype: str = "float16"): - accum_dtype = "float" + }, +) +def flashattn_bwd_postprocess(batch, heads, seq_len, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 64 @T.prim_func def flash_bwd_post( - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(shape, dtype), # type: ignore + dQ: 
T.Tensor(shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) T.copy( - dQ[bz, by, bx * blk:(bx + 1) * blk, :], - dQ_out[bz, by, bx * blk:(bx + 1) * blk, :], + dQ[bz, by, bx * blk : (bx + 1) * blk, :], + dQ_out[bz, by, bx * blk : (bx + 1) * blk, :], ) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd(batch, - heads, - seq_len, - dim, - groups, - window_size=None, - sm_scale=None, - dtype="float16"): # None for full attention +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd(batch, heads, seq_len, dim, groups, window_size=None, sm_scale=None, dtype=T.float16): # None for full attention if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, heads, seq_len, dim] kv_shape = [batch, head_kv, seq_len, dim] - accum_dtype = "float" + accum_dtype = T.float32 block_M, block_N, num_stages, threads = get_bwd_configs() @@ -223,15 +218,15 @@ def flashattn_bwd(batch, @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(kv_shape, dtype), # type: ignore - V: T.Tensor(kv_shape, dtype), # type: ignore - dO: T.Tensor(q_shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(kv_shape, accum_dtype), # type: ignore - dV: T.Tensor(kv_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(kv_shape, dtype), # type: ignore + V: T.Tensor(kv_shape, dtype), # type: ignore + dO: T.Tensor(q_shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(kv_shape, accum_dtype), # type: ignore + dV: T.Tensor(kv_shape, accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim], dtype) @@ -251,44 +246,44 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim], accum_dtype) dk_shared = T.alloc_shared([block_M, dim], accum_dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) - T.copy(K[bz, bx // groups, by * block_M:(by + 1) * block_M, :], K_shared) - T.copy(V[bz, bx // groups, by * block_M:(by + 1) * block_M, :], V_shared) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) + T.copy(K[bz, bx // groups, by * block_M : (by + 1) * block_M, :], K_shared) + T.copy(V[bz, bx // groups, by * block_M : (by + 1) * block_M, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) - loop_ed = T.min( - T.ceildiv((by + 1) * block_M + window_size, block_N), T.ceildiv( - seq_len, block_N)) if window_size is not None else T.ceildiv(seq_len, block_N) + loop_ed = ( + T.min(T.ceildiv((by + 1) * block_M + window_size, block_N), T.ceildiv(seq_len, block_N)) + if window_size is not None + 
else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, bx, k * block_N:(k + 1) * block_N, :], q) + T.copy(Q[bz, bx, k * block_N : (k + 1) * block_N, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) for i, j in T.Parallel(block_M, block_N): if window_size is not None: qkT[i, j] = T.if_then_else( - by * block_M + i <= k * block_N + j and - by * block_M + i > k * block_N + j - window_size, qkT[i, j], 0) + by * block_M + i <= k * block_N + j and by * block_M + i > k * block_N + j - window_size, qkT[i, j], 0 + ) else: - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, bx, k * block_N:(k + 1) * block_N, :], dst=do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, bx, k * block_N : (k + 1) * block_N, :], dst=do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -297,50 +292,46 @@ def flash_bwd( T.copy(dsT_cast, dsT_shared) T.clear(dq) T.gemm(dsT_shared, K_shared, dq, transpose_A=True) - T.atomic_add(dQ[bz, bx, k * block_N:(k + 1) * block_N, :], dq) + T.atomic_add(dQ[bz, bx, k * block_N : (k + 1) * block_N, :], dq) T.copy(dv, dv_shared) - T.atomic_add(dV[bz, bx // groups, by * block_M:(by + 1) * block_M, :], dv_shared) + T.atomic_add(dV[bz, bx // groups, by * block_M : (by + 1) * block_M, :], dv_shared) T.copy(dk, dk_shared) - T.atomic_add(dK[bz, bx // groups, by * block_M:(by + 1) * block_M, :], dk_shared) + T.atomic_add(dK[bz, bx // groups, by * block_M : (by + 1) * block_M, :], dk_shared) return flash_bwd @tilelang.jit(out_idx=-1) -def flashattn_bwd_dsink(batch, heads, seq_len, block=256, dtype: str = "float16"): - accum_dtype = "float" +def flashattn_bwd_dsink(batch, heads, seq_len, block=256, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len] @T.prim_func def flash_bwd_dsink( - Sinks: T.Tensor([heads], dtype), # type: ignore - Delta: T.Tensor(shape, accum_dtype), # type: ignore - lse: T.Tensor(shape, accum_dtype), # type: ignore - dsinks: T.Tensor(shape, dtype), # type: ignore + Sinks: T.Tensor([heads], dtype), # type: ignore + Delta: T.Tensor(shape, accum_dtype), # type: ignore + lse: T.Tensor(shape, accum_dtype), # type: ignore + dsinks: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block), batch, threads=256) as (bx, by, bz): - sink = T.alloc_local([1], dtype) lse_fragment = T.alloc_fragment([block], accum_dtype) delta_fragment = T.alloc_fragment([block], accum_dtype) dsink_fragment = T.alloc_fragment([block], dtype) - sink[0] = Sinks[bx] - T.copy(lse[bz, bx, by * block:(by + 1) * block], lse_fragment) - T.copy(Delta[bz, bx, by * block:(by + 1) * block], delta_fragment) + sink = Sinks[bx] + T.copy(lse[bz, bx, by * block : (by + 1) * block], lse_fragment) + T.copy(Delta[bz, bx, by * block : (by + 1) * block], delta_fragment) for i in T.Parallel(block): 
- dsink_fragment[i] = -T.exp2(Sinks[bx] * 1.44269504 - - lse_fragment[i]) * delta_fragment[i] - T.copy(dsink_fragment, dsinks[bz, bx, by * block:(by + 1) * block]) + dsink_fragment[i] = -T.exp2(sink * 1.44269504 - lse_fragment[i]) * delta_fragment[i] + T.copy(dsink_fragment, dsinks[bz, bx, by * block : (by + 1) * block]) return flash_bwd_dsink class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, sinks, window_size, groups): - def maybe_contiguous(x): if x.stride(-1) != 1: return x.contiguous() @@ -348,7 +339,7 @@ def maybe_contiguous(x): q, k, v, sinks = [maybe_contiguous(x) for x in (q, k, v, sinks)] BATCH, H, N_CTX, D_HEAD = q.shape - dtype = "float16" if q.dtype == torch.float16 else "bfloat16" + dtype = T.float16 if q.dtype == torch.float16 else T.bfloat16 kernel = flashattn_fwd(BATCH, H, N_CTX, D_HEAD, groups, window_size, dtype=dtype) o, lse = kernel(q, k, v, sinks) ctx.save_for_backward(q, k, v, sinks, o, lse) @@ -361,7 +352,7 @@ def backward(ctx, do): q, k, v, sinks, o, lse = ctx.saved_tensors BATCH, H, N_CTX, D_HEAD = q.shape groups = ctx.groups - dtype = "float16" if q.dtype == torch.float16 else "bfloat16" + dtype = T.float16 if q.dtype == torch.float16 else T.bfloat16 kernel_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) kernel_post = flashattn_bwd_postprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) @@ -386,13 +377,14 @@ def backward(ctx, do): # Adapted and optimized from # https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py -def ref_program(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - sinks: torch.Tensor, - sliding_window: Optional[int] = None, - dtype: torch.dtype = torch.float16) -> torch.Tensor: - +def ref_program( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sinks: torch.Tensor, + sliding_window: Optional[int] = None, + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: key = key.transpose(1, 2).contiguous() value = value.transpose(1, 2).contiguous() batch_size, num_keys, num_key_value_heads, head_dim = key.shape @@ -428,32 +420,32 @@ def ref_program(query: torch.Tensor, output = torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float()) - output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, - head_dim).to(dtype) + output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, head_dim).to(dtype) return output.transpose(1, 2).contiguous() -def main(BATCH: int = 1, - H: int = 8, - N_CTX: int = 512, - D_HEAD: int = 64, - groups: int = 2, - window_size: Optional[int] = None, - dtype: str = "float16"): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] +def main( + BATCH: int = 1, + H: int = 8, + N_CTX: int = 512, + D_HEAD: int = 64, + groups: int = 2, + window_size: Optional[int] = None, + dtype: str = "float16", +): + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= N_CTX - flops_per_matmul = 2.0 * BATCH * H * min( - window_size, N_CTX // 2) * N_CTX * D_HEAD # just a rough estimation + flops_per_matmul = 2.0 * BATCH * H * min(window_size, N_CTX // 2) * N_CTX * D_HEAD # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD * 0.5 total_flops = 5 * flops_per_matmul - Q = (torch.randn(BATCH, H, N_CTX, D_HEAD, 
dtype=torch_dtype, device="cuda").requires_grad_()) - K = torch.randn( - BATCH, H // groups, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_() + Q = torch.randn(BATCH, H, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_() + K = torch.randn(BATCH, H // groups, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_() V = torch.randn_like(K).requires_grad_() sinks = torch.randn(H, dtype=torch_dtype, device="cuda").requires_grad_() dO = torch.randn_like(Q) @@ -474,19 +466,14 @@ def main(BATCH: int = 1, # Checks rtol, atol = { - "float16": (1e-2, 1e-2), - "bfloat16": (2e-2, 2e-2), + T.float16: (1e-2, 1e-2), + T.bfloat16: (2e-2, 2e-2), }[dtype] - assert torch.allclose(O, O_ref, rtol=rtol, atol=atol), f'O max err: {(O-O_ref).abs().max()}' - assert torch.allclose( - dV, dV_ref, rtol=rtol, atol=atol), f'dV max err: {(dV-dV_ref).abs().max()}' - assert torch.allclose( - dK, dK_ref, rtol=rtol, atol=atol), f'dK max err: {(dK-dK_ref).abs().max()}' - assert torch.allclose( - dQ, dQ_ref, rtol=rtol, atol=atol), f'dq max err: {(dQ-dQ_ref).abs().max()}' - assert torch.allclose( - dsinks, dsinks_ref, rtol=rtol, - atol=atol), f'dsinks max err: {(dsinks-dsinks_ref).abs().max()}' + assert torch.allclose(O, O_ref, rtol=rtol, atol=atol), f"O max err: {(O - O_ref).abs().max()}" + assert torch.allclose(dV, dV_ref, rtol=rtol, atol=atol), f"dV max err: {(dV - dV_ref).abs().max()}" + assert torch.allclose(dK, dK_ref, rtol=rtol, atol=atol), f"dK max err: {(dK - dK_ref).abs().max()}" + assert torch.allclose(dQ, dQ_ref, rtol=rtol, atol=atol), f"dq max err: {(dQ - dQ_ref).abs().max()}" + assert torch.allclose(dsinks, dsinks_ref, rtol=rtol, atol=atol), f"dsinks max err: {(dsinks - dsinks_ref).abs().max()}" print("All checks passed for tilelang kernels.✅") @@ -505,19 +492,57 @@ def tl_bwd(): print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf( + BATCH: int = 1, + H: int = 8, + N_CTX: int = 512, + D_HEAD: int = 64, + groups: int = 2, + window_size: Optional[int] = None, + dtype: str = "float16", +): + torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + with torch.no_grad(): + Q = torch.randn(BATCH, H, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda") + K = torch.randn(BATCH, H // groups, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda") + V = torch.randn_like(K) + sinks = torch.randn(H, dtype=torch_dtype, device="cuda") + dO = torch.randn_like(Q) + fwd = flashattn_fwd(BATCH, H, N_CTX, D_HEAD, groups, window_size, dtype=dtype) + O, lse = fwd(Q, K, V, sinks) + + def maybe_contiguous(x): + return x if x.stride(-1) == 1 else x.contiguous() + + do, q, k, v, sinks_c, o = [maybe_contiguous(x) for x in (dO, Q, K, V, sinks, O)] + k_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) + Delta = k_prep(o, do) + k_bwd = flashattn_bwd(BATCH, H, N_CTX, D_HEAD, groups, window_size, dtype=dtype) + k_dsink = flashattn_bwd_dsink(BATCH, H, N_CTX, dtype=dtype) + q_shape = (BATCH, H, N_CTX, D_HEAD) + head_kv = H // groups + kv_shape = (BATCH, head_kv, N_CTX, D_HEAD) + dq = torch.zeros(q_shape, dtype=torch.float32, device="cuda") + dk = torch.zeros(kv_shape, dtype=torch.float32, device="cuda") + dv = torch.zeros(kv_shape, dtype=torch.float32, device="cuda") + k_bwd(q, k, v, do, lse, Delta, dq, dk, dv) + _ = k_dsink(sinks_c, Delta, lse).sum(0).sum(1) + + def run_kernel_only(): + k_bwd(q, k, v, do, lse, Delta, dq, dk, dv) + + latency_ms = do_bench(run_kernel_only, backend="cupti") + return latency_ms + + if __name__ == 
"__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='Batch size') - parser.add_argument('--h', type=int, default=64, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=4096, help='Context size') - parser.add_argument('--d_head', type=int, default=128, help='Head dimension') - parser.add_argument('--groups', type=int, default=8, help='Groups') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") + parser.add_argument("--batch", type=int, default=1, help="Batch size") + parser.add_argument("--h", type=int, default=64, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=4096, help="Context size") + parser.add_argument("--d_head", type=int, default=128, help="Head dimension") + parser.add_argument("--groups", type=int, default=8, help="Groups") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default="float16", help="dtype, can be float16 or bfloat16") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, args.groups, args.window_size, args.dtype) diff --git a/examples/attention_sink/example_gqa_sink_bwd_varlen.py b/examples/attention_sink/example_gqa_sink_bwd_varlen.py new file mode 100644 index 000000000..64a5a39a8 --- /dev/null +++ b/examples/attention_sink/example_gqa_sink_bwd_varlen.py @@ -0,0 +1,798 @@ +import torch +import tilelang +from tilelang.profiler import do_bench +import tilelang.language as T +import argparse +from typing import Optional +import sys +import os + +sys.path.append(os.path.join(os.path.dirname(__file__), "../flash_attention")) +from varlen_utils import generate_random_padding_mask, generate_qkv + + +def get_bwd_configs(): + sm_major, sm_minor = torch.cuda.get_device_capability() + sm_version = sm_major * 10 + sm_minor + if sm_version == 80: + return 64, 32, 1, 128 + else: + return 128, 32, 2, 256 + + +@tilelang.jit( + out_idx=[6, 7], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def flashattn_fwd( + batch_size, + groups, + UQ, + UKV, + N_CTX, + heads, + max_seq_len, + dim, + is_causal, + window_size=None, # None for full causal attention + sm_scale=None, + block_M=64, + block_N=64, + num_stages=1, + threads=128, + dtype=T.float16, +): + if window_size is not None: + assert window_size % block_N == 0, "window_size must be divisible by block_N" + + if sm_scale is None: + sm_scale = (1.0 / dim) ** 0.5 + scale = sm_scale * 1.44269504 # log2(e) + + head_kv = heads // groups + q_shape = [UQ, heads, dim] + kv_shape = [UKV, head_kv, dim] + o_shape = [UQ, heads, dim] + accum_dtype = T.float32 + + @T.prim_func + def main( + Q_unpad: T.Tensor(q_shape, dtype), + K_unpad: T.Tensor(kv_shape, dtype), + V_unpad: T.Tensor(kv_shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + cu_seqlens_k: T.Tensor([batch_size + 1], T.int32), + Sinks: T.Tensor([heads], dtype), + Output_unpad: T.Tensor(o_shape, dtype), + lse: T.Tensor([batch_size, heads, N_CTX], accum_dtype), + ): + with T.Kernel(T.ceildiv(max_seq_len, block_M), heads, batch_size, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_M, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, 
dim], dtype) + acc_s = T.alloc_fragment([block_M, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_M, block_N], dtype) + acc_o = T.alloc_fragment([block_M, dim], accum_dtype) + scores_max = T.alloc_fragment([block_M], accum_dtype) + scores_max_prev = T.alloc_fragment([block_M], accum_dtype) + scores_scale = T.alloc_fragment([block_M], accum_dtype) + scores_sum = T.alloc_fragment([block_M], accum_dtype) + logsum = T.alloc_fragment([block_M], accum_dtype) + sinks = T.alloc_fragment([block_M], dtype) + + batch_idx = bz + head_idx = by + kv_head_idx = head_idx // groups + + q_start_idx = cu_seqlens_q[batch_idx] + kv_start_idx = cu_seqlens_k[batch_idx] + q_end_idx = cu_seqlens_q[batch_idx + 1] + k_end_idx = cu_seqlens_k[batch_idx + 1] + + q_current_seqlen = q_end_idx - q_start_idx + kv_current_seqlen = k_end_idx - kv_start_idx + + T.copy(Q_unpad[q_start_idx + bx * block_M : q_start_idx + (bx + 1) * block_M, head_idx, :], Q_shared) + + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + for i in T.Parallel(block_M): + sinks[i] = Sinks[head_idx] + + offset = kv_current_seqlen - q_current_seqlen # always align on the right + max_visible_k_idx = offset + (bx + 1) * block_M + + # Determine loop range based on causal mask and sliding window + if is_causal: + if window_size is not None: + start = T.max(0, (offset + bx * block_M - window_size + 1) // block_N) + end = T.min(T.ceildiv(max_visible_k_idx, block_N), T.ceildiv(kv_current_seqlen, block_N)) + else: + start = 0 + end = T.min(T.ceildiv(max_visible_k_idx, block_N), T.ceildiv(kv_current_seqlen, block_N)) + else: + if window_size is not None: + start = T.max(0, (offset + bx * block_M - window_size + 1) // block_N) + end = T.ceildiv(kv_current_seqlen, block_N) + else: + start = 0 + end = T.ceildiv(kv_current_seqlen, block_N) + + loop_range = end - start + + for k in T.Pipelined(loop_range, num_stages=num_stages): + actual_k = k + start + T.copy(K_unpad[kv_start_idx + actual_k * block_N : kv_start_idx + (actual_k + 1) * block_N, kv_head_idx, :], K_shared) + + # Build mask considering causal, sliding window, and padding + if is_causal: + if window_size is not None: + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + offset + k_idx = actual_k * block_N + j + acc_s[i, j] = T.if_then_else( + (q_idx < k_idx) + or (q_idx >= k_idx + window_size) + or (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else( + (bx * block_M + i + offset < actual_k * block_N + j) + or (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) + else: + if window_size is not None: + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + offset + k_idx = actual_k * block_N + j + acc_s[i, j] = T.if_then_else( + (q_idx >= k_idx + window_size) + or (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) + + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(V_unpad[kv_start_idx + actual_k * block_N : kv_start_idx + (actual_k + 1) * block_N, kv_head_idx, :], V_shared) + 
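# Online-softmax update for this KV tile (scale = sm_scale * log2(e)):
+ #   m_new = max(m_old, rowmax(acc_s))           running row maximum
+ #   alpha = exp2(m_old * scale - m_new * scale)  rescale factor for the old state
+ #   p     = exp2(acc_s * scale - m_new * scale)  unnormalized probabilities
+ #   l_new = l_old * alpha + rowsum(p);  acc_o_new = acc_o * alpha + p @ V
+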
T.copy(scores_max, scores_max_prev) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + + # Handle case where scores_max is -inf (query sees no keys due to causal mask or sliding window) + # This can happen when q_len > k_len (offset < 0) in causal attention, or with sliding window + for i in T.Parallel(block_M): + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + + T.copy(acc_s, acc_s_cast) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + + # Attention sink: add sink contribution to logsum + for i in T.Parallel(block_M): + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] = 0 if is_causal and bx * block_M + i + offset < 0 else acc_o[i, j] / logsum[i] + + for i, d in T.Parallel(block_M, dim): + if bx * block_M + i < q_current_seqlen: + Output_unpad[q_start_idx + bx * block_M + i, head_idx, d] = acc_o[i, d] + + for i in T.Parallel(block_M): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + for i in T.Parallel(block_M): + if bx * block_M + i < q_current_seqlen: + lse[bz, head_idx, bx * block_M + i] = logsum[i] + + return main + + +@tilelang.jit( + out_idx=[3], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def flashattn_bwd_preprocess(batch_size, heads, UQ, N_CTX, max_seq_len, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 + shape = [UQ, heads, dim] + blk = 32 + + @T.prim_func + def flash_bwd_prep( + O: T.Tensor(shape, dtype), + dO: T.Tensor(shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + Delta: T.Tensor([batch_size, heads, N_CTX], accum_dtype), + ): + with T.Kernel(heads, T.ceildiv(max_seq_len, blk), batch_size) as (bx, by, bz): + o = T.alloc_fragment([blk, blk], dtype) + do = T.alloc_fragment([blk, blk], dtype) + acc = T.alloc_fragment([blk, blk], accum_dtype) + delta = T.alloc_fragment([blk], accum_dtype) + + q_start_idx = cu_seqlens_q[bz] + q_end_idx = cu_seqlens_q[bz + 1] + q_current_seqlen = q_end_idx - q_start_idx + + T.clear(acc) + for k in range(T.ceildiv(dim, blk)): + for i, j in T.Parallel(blk, blk): + if by * blk + i < q_current_seqlen and k * blk + j < dim: + o[i, j] = O[q_start_idx + by * blk + i, bx, k * blk + j] + do[i, j] = dO[q_start_idx + by * blk + i, bx, k * blk + j] + else: + o[i, j] = 0.0 + do[i, j] = 0.0 + for i, j in T.Parallel(blk, blk): + acc[i, j] += o[i, j] * do[i, j] + T.reduce_sum(acc, delta, 1) + + for i in T.Parallel(blk): + if by * blk + i < q_current_seqlen: + Delta[bz, bx, by * blk + i] = delta[i] + + return flash_bwd_prep + + +def make_dq_layout(dQ): + # Reorder dq for atomic add: [seq, head, dim] -> permuted layout + return T.Layout(dQ.shape, lambda l, h, d: [h, l, d]) + + +@tilelang.jit( + out_idx=[1], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def flashattn_bwd_postprocess(UQ, heads, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 + shape = [UQ, heads, dim] + blk = 64 + + 
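# flash_bwd_post converts the float32 dQ accumulator (filled with atomic adds in
+ # the permuted layout from make_dq_layout) back to the output dtype; the layout
+ # annotation below tells the copy how dQ is actually laid out in memory.
+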
@T.prim_func + def flash_bwd_post( + dQ: T.Tensor(shape, accum_dtype), + dQ_out: T.Tensor(shape, dtype), + ): + with T.Kernel(T.ceildiv(UQ, blk), heads, threads=128) as (bx, by): + T.annotate_layout({dQ: make_dq_layout(dQ)}) + T.copy( + dQ[bx * blk : (bx + 1) * blk, by, :], + dQ_out[bx * blk : (bx + 1) * blk, by, :], + ) + + return flash_bwd_post + + +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd( + batch_size, + groups, + UQ, + UKV, + N_CTX, + heads, + max_seq_len, + dim, + is_causal, + window_size=None, + sm_scale=None, + dtype=T.float16, +): + if sm_scale is None: + sm_scale = (1.0 / dim) ** 0.5 + scale = sm_scale * 1.44269504 # log2(e) + + head_kv = heads // groups + q_shape = [UQ, heads, dim] + kv_shape = [UKV, head_kv, dim] + accum_dtype = T.float32 + + block_M, block_N, num_stages, threads = get_bwd_configs() + + if window_size is not None: + assert window_size % block_N == 0, "window_size must be divisible by block_N" + + @T.prim_func + def flash_bwd( + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + dO: T.Tensor(q_shape, dtype), + lse: T.Tensor([batch_size, heads, N_CTX], accum_dtype), + Delta: T.Tensor([batch_size, heads, N_CTX], accum_dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + cu_seqlens_k: T.Tensor([batch_size + 1], T.int32), + dQ: T.Tensor(q_shape, accum_dtype), + dK: T.Tensor(kv_shape, accum_dtype), + dV: T.Tensor(kv_shape, accum_dtype), + ): + with T.Kernel(heads, T.ceildiv(max_seq_len, block_M), batch_size, threads=threads) as (bx, by, bz): + K_shared = T.alloc_shared([block_M, dim], dtype) + dsT_shared = T.alloc_shared([block_M, block_N], dtype) + q = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_M, dim], dtype) + qkT = T.alloc_fragment([block_M, block_N], accum_dtype) + dsT = T.alloc_fragment([block_M, block_N], accum_dtype) + qkT_cast = T.alloc_fragment([block_M, block_N], dtype) + dsT_cast = T.alloc_fragment([block_M, block_N], dtype) + lse_shared = T.alloc_shared([block_N], accum_dtype) + delta = T.alloc_shared([block_N], accum_dtype) + do = T.alloc_shared([block_N, dim], dtype) + dv = T.alloc_fragment([block_M, dim], accum_dtype) + dk = T.alloc_fragment([block_M, dim], accum_dtype) + dq = T.alloc_fragment([block_N, dim], accum_dtype) + dv_shared = T.alloc_shared([block_M, dim], accum_dtype) + dk_shared = T.alloc_shared([block_M, dim], accum_dtype) + + q_start_idx = cu_seqlens_q[bz] + kv_start_idx = cu_seqlens_k[bz] + q_end_idx = cu_seqlens_q[bz + 1] + k_end_idx = cu_seqlens_k[bz + 1] + q_current_seqlen = q_end_idx - q_start_idx + kv_current_seqlen = k_end_idx - kv_start_idx + + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) + T.copy(K[kv_start_idx + by * block_M : kv_start_idx + (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[kv_start_idx + by * block_M : kv_start_idx + (by + 1) * block_M, bx // groups, :], V_shared) + T.clear(dv) + T.clear(dk) + + # For varlen causal attention, we need to account for offset between q and kv lengths + # In forward: Q at pos q can see KV at pos k if q + offset >= k (where offset = kv_len - q_len) + # In backward: KV at pos kv_pos is seen by Q at pos q_pos if kv_pos <= q_pos + offset + offset = kv_current_seqlen - q_current_seqlen + + # loop_st: first Q block that can see this KV block + # kv_pos <= q_pos + offset => by * block_M <= k * block_N + offset + # => k >= (by * block_M - offset) / block_N + loop_st = T.max(0, T.floordiv(by * block_M - offset, 
block_N)) if is_causal else 0 + loop_ed = ( + T.min(T.ceildiv((by + 1) * block_M - offset + window_size, block_N), T.ceildiv(q_current_seqlen, block_N)) + if window_size is not None + else T.ceildiv(q_current_seqlen, block_N) + ) + + for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): + T.copy(Q[q_start_idx + k * block_N : q_start_idx + (k + 1) * block_N, bx, :], q) + T.clear(qkT) + T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) + for i, j in T.Parallel(block_M, block_N): + qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) + if is_causal: + if window_size is not None: + for i, j in T.Parallel(block_M, block_N): + # Causal: kv_pos <= q_pos + offset + # Sliding window: kv_pos > q_pos + offset - window_size + qkT[i, j] = T.if_then_else( + (by * block_M + i <= k * block_N + j + offset) + and (by * block_M + i > k * block_N + j + offset - window_size) + and (by * block_M + i < kv_current_seqlen and k * block_N + j < q_current_seqlen), + qkT[i, j], + 0, + ) + else: + for i, j in T.Parallel(block_M, block_N): + # Causal: kv_pos <= q_pos + offset + qkT[i, j] = T.if_then_else( + (by * block_M + i <= k * block_N + j + offset) + and (by * block_M + i < kv_current_seqlen and k * block_N + j < q_current_seqlen), + qkT[i, j], + 0, + ) + else: + if window_size is not None: + for i, j in T.Parallel(block_M, block_N): + qkT[i, j] = T.if_then_else( + (by * block_M + i > k * block_N + j + offset - window_size) + and (by * block_M + i < kv_current_seqlen and k * block_N + j < q_current_seqlen), + qkT[i, j], + 0, + ) + else: + for i, j in T.Parallel(block_M, block_N): + qkT[i, j] = T.if_then_else( + by * block_M + i < kv_current_seqlen and k * block_N + j < q_current_seqlen, + qkT[i, j], + 0, + ) + + T.copy(dO[q_start_idx + k * block_N : q_start_idx + (k + 1) * block_N, bx, :], dst=do) + T.clear(dsT) + T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + T.copy(qkT, qkT_cast) + T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) + + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) + + for i, j in T.Parallel(block_M, block_N): + dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale + T.gemm(dsT_cast, q, dk, policy=T.GemmWarpPolicy.FullRow) + + T.copy(dsT_cast, dsT_shared) + T.clear(dq) + T.gemm(dsT_shared, K_shared, dq, transpose_A=True) + T.atomic_add(dQ[q_start_idx + k * block_N : q_start_idx + (k + 1) * block_N, bx, :], dq) + + T.copy(dv, dv_shared) + T.atomic_add(dV[kv_start_idx + by * block_M : kv_start_idx + (by + 1) * block_M, bx // groups, :], dv_shared) + T.copy(dk, dk_shared) + T.atomic_add(dK[kv_start_idx + by * block_M : kv_start_idx + (by + 1) * block_M, bx // groups, :], dk_shared) + + return flash_bwd + + +@tilelang.jit(out_idx=-1) +def flashattn_bwd_dsink(batch_size, heads, N_CTX, max_seq_len, block=256, dtype: T.dtype = T.float16): + accum_dtype = T.float32 + shape = [batch_size, heads, N_CTX] + + @T.prim_func + def flash_bwd_dsink( + Sinks: T.Tensor([heads], dtype), + Delta: T.Tensor(shape, accum_dtype), + lse: T.Tensor(shape, accum_dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + dsinks: T.Tensor(shape, dtype), + ): + with T.Kernel(heads, T.ceildiv(max_seq_len, block), batch_size, threads=256) as (bx, by, bz): + lse_fragment = T.alloc_fragment([block], accum_dtype) + delta_fragment = T.alloc_fragment([block], accum_dtype) + dsink_fragment = T.alloc_fragment([block], dtype) + + # Get actual sequence length for this 
batch item + q_start_idx = cu_seqlens_q[bz] + q_end_idx = cu_seqlens_q[bz + 1] + q_current_seqlen = q_end_idx - q_start_idx + + sink = Sinks[bx] + T.copy(lse[bz, bx, by * block : (by + 1) * block], lse_fragment) + T.copy(Delta[bz, bx, by * block : (by + 1) * block], delta_fragment) + for i in T.Parallel(block): + # Only compute for valid positions, set 0 for positions beyond sequence length + dsink_fragment[i] = T.if_then_else( + by * block + i < q_current_seqlen, + -T.exp2(sink * 1.44269504 - lse_fragment[i]) * delta_fragment[i], + 0, + ) + T.copy(dsink_fragment, dsinks[bz, bx, by * block : (by + 1) * block]) + + return flash_bwd_dsink + + +class _attention(torch.autograd.Function): + @staticmethod + def forward( + ctx, q_unpad, k_unpad, v_unpad, sinks, cu_seqlens_q, cu_seqlens_k, N_CTX, max_seqlen_q, max_seqlen_k, window_size, groups, is_causal + ): + def maybe_contiguous(x): + if x.stride(-1) != 1: + return x.contiguous() + return x + + q_unpad, k_unpad, v_unpad, sinks = [maybe_contiguous(x) for x in (q_unpad, k_unpad, v_unpad, sinks)] + UQ, H, D_HEAD = q_unpad.shape + UKV = k_unpad.shape[0] + batch_size = cu_seqlens_q.shape[0] - 1 + dtype = T.float16 if q_unpad.dtype == torch.float16 else T.bfloat16 + + kernel = flashattn_fwd( + batch_size, + groups, + UQ, + UKV, + N_CTX, + H, + max_seqlen_q, + D_HEAD, + is_causal, + window_size=window_size, + block_M=64, + block_N=64, + num_stages=1, + threads=128, + dtype=dtype, + ) + o_unpad, lse = kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, sinks) + + ctx.save_for_backward(q_unpad, k_unpad, v_unpad, sinks, o_unpad, lse, cu_seqlens_q, cu_seqlens_k) + ctx.window_size = window_size + ctx.groups = groups + ctx.is_causal = is_causal + ctx.N_CTX = N_CTX + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.batch_size = batch_size + return o_unpad + + @staticmethod + def backward(ctx, do): + q_unpad, k_unpad, v_unpad, sinks, o_unpad, lse, cu_seqlens_q, cu_seqlens_k = ctx.saved_tensors + UQ, H, D_HEAD = q_unpad.shape + UKV = k_unpad.shape[0] + groups = ctx.groups + batch_size = ctx.batch_size + dtype = T.float16 if q_unpad.dtype == torch.float16 else T.bfloat16 + + kernel_prep = flashattn_bwd_preprocess(batch_size, H, UQ, ctx.N_CTX, ctx.max_seqlen_q, D_HEAD, dtype=dtype) + kernel_post = flashattn_bwd_postprocess(UQ, H, D_HEAD, dtype=dtype) + delta = kernel_prep(o_unpad, do, cu_seqlens_q) + + kernel = flashattn_bwd( + batch_size, + groups, + UQ, + UKV, + ctx.N_CTX, + H, + ctx.max_seqlen_q, + D_HEAD, + ctx.is_causal, + window_size=ctx.window_size, + dtype=dtype, + ) + + head_kv = H // groups + dq = torch.zeros_like(q_unpad, dtype=torch.float32) + dk = torch.zeros([UKV, head_kv, D_HEAD], dtype=torch.float32, device=q_unpad.device) + dv = torch.zeros([UKV, head_kv, D_HEAD], dtype=torch.float32, device=q_unpad.device) + + kernel(q_unpad, k_unpad, v_unpad, do, lse, delta, cu_seqlens_q, cu_seqlens_k, dq, dk, dv) + dq = kernel_post(dq) + dk = dk.to(q_unpad.dtype) + dv = dv.to(q_unpad.dtype) + + kernel_dsink = flashattn_bwd_dsink(batch_size, H, ctx.N_CTX, ctx.max_seqlen_q, dtype=dtype) + dsinks = kernel_dsink(sinks, delta, lse, cu_seqlens_q).sum(0).sum(1) + + return dq, dk, dv, dsinks, None, None, None, None, None, None, None, None + + +attention = _attention.apply + + +def ref_program( + q_unpad: torch.Tensor, + k_unpad: torch.Tensor, + v_unpad: torch.Tensor, + cu_seqlens_q: torch.Tensor, + cu_seqlens_k: torch.Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + sinks: torch.Tensor, + batch_size: int, + is_causal: bool, + 
sliding_window: Optional[int] = None, + groups: int = 1, +) -> torch.Tensor: + """Reference implementation for varlen attention with sinks.""" + total_q, num_heads, head_dim = q_unpad.shape + _, num_key_value_heads, _ = k_unpad.shape + + sm_scale = 1.0 / head_dim**0.5 + + output = torch.zeros_like(q_unpad) + + for b in range(batch_size): + q_start = cu_seqlens_q[b].item() + q_end = cu_seqlens_q[b + 1].item() + k_start = cu_seqlens_k[b].item() + k_end = cu_seqlens_k[b + 1].item() + + q_len = q_end - q_start + k_len = k_end - k_start + + if q_len == 0: + continue + + q_seq = q_unpad[q_start:q_end] # [q_len, heads, dim] + k_seq = k_unpad[k_start:k_end] # [k_len, head_kv, dim] + v_seq = v_unpad[k_start:k_end] # [k_len, head_kv, dim] + + # Reshape for GQA + q_seq = q_seq.view(q_len, num_key_value_heads, groups, head_dim) + sinks_expanded = sinks.view(num_key_value_heads, groups, 1, 1).float() + + k_seq = k_seq.unsqueeze(2) # [k_len, head_kv, 1, dim] + v_seq = v_seq.unsqueeze(2) # [k_len, head_kv, 1, dim] + + logits = torch.einsum("qhgd,khgd->hgqk", q_seq.float(), k_seq.float()) * sm_scale + + start_q = k_len - q_len + pos_keys = torch.arange(k_len, device=q_unpad.device) + pos_queries = torch.arange(q_len, device=q_unpad.device) + start_q + + if is_causal: + mask = pos_keys[None, :] > pos_queries[:, None] + mask = mask.float().masked_fill(mask, float("-inf")) + else: + mask = torch.zeros(q_len, k_len, device=q_unpad.device) + + if sliding_window is not None: + too_old = pos_keys[None, :] < (pos_queries[:, None] - sliding_window + 1) + mask.masked_fill_(too_old, float("-inf")) + + logits = logits + mask[None, None, :, :] + + logits_max = torch.max(logits, dim=-1, keepdim=True).values + logits_or_sinks_max = torch.maximum(sinks_expanded, logits_max) + sinks_exp = torch.exp(sinks_expanded - logits_or_sinks_max) + unnormalized_scores = torch.exp(logits - logits_or_sinks_max) + normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks_exp + scores = unnormalized_scores / normalizer + + out = torch.einsum("hgqk,khgd->qhgd", scores, v_seq.float()) + out = out.reshape(q_len, num_heads, head_dim).to(q_unpad.dtype) + + output[q_start:q_end] = out + + return output + + +def main( + batch: int = 1, + heads: int = 64, + q_seqlen: int = 2048, + k_seqlen: int = 2048, + dim: int = 128, + groups: int = 16, + is_causal: bool = True, + window_size: Optional[int] = None, +): + assert heads % groups == 0, "heads must be divisible by groups" + + flops_per_matmul = 2.0 * batch * heads * q_seqlen * k_seqlen * dim + total_flops = 5 * flops_per_matmul # fwd + bwd + + if is_causal: + total_flops *= 0.5 + + if window_size is not None: + print(f"Using sliding window attention with window_size={window_size}") + flops_per_matmul = 2.0 * batch * heads * min(window_size, k_seqlen // 2) * q_seqlen * dim + total_flops = 5 * flops_per_matmul + + dtype = torch.float16 + device = torch.device("cuda") + + head_kv = heads // groups + q = torch.randn(batch, q_seqlen, heads, dim, dtype=dtype, device=device) + k = torch.randn(batch, k_seqlen, head_kv, dim, dtype=dtype, device=device) + v = torch.randn(batch, k_seqlen, head_kv, dim, dtype=dtype, device=device) + sinks = torch.randn(heads, dtype=dtype, device=device) + + query_padding_mask = generate_random_padding_mask(q_seqlen, batch, device, mode="random") + key_padding_mask = generate_random_padding_mask(k_seqlen, batch, device, mode="random") + + ( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + k, + v, + output_pad_fn, 
+ _, + _, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + + q_unpad = q_unpad.requires_grad_(True) + k_unpad = k_unpad.requires_grad_(True) + v_unpad = v_unpad.requires_grad_(True) + sinks = sinks.requires_grad_(True) + + dO_unpad = torch.randn_like(q_unpad) + + # TileLang forward + backward + # N_CTX is the padded sequence length used for tensor allocation + N_CTX = q_seqlen + O_unpad = attention( + q_unpad, k_unpad, v_unpad, sinks, cu_seqlens_q, cu_seqlens_k, N_CTX, max_seqlen_q, max_seqlen_k, window_size, groups, is_causal + ) + O_unpad.backward(dO_unpad, retain_graph=True) + dQ, q_unpad.grad = q_unpad.grad.clone(), None + dK, k_unpad.grad = k_unpad.grad.clone(), None + dV, v_unpad.grad = v_unpad.grad.clone(), None + dsinks, sinks.grad = sinks.grad.clone(), None + + # Reference forward + backward + O_ref_unpad = ref_program( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + sinks, + batch, + is_causal, + sliding_window=window_size, + groups=groups, + ) + O_ref_unpad.backward(dO_unpad, retain_graph=True) + dQ_ref, q_unpad.grad = q_unpad.grad.clone(), None + dK_ref, k_unpad.grad = k_unpad.grad.clone(), None + dV_ref, v_unpad.grad = v_unpad.grad.clone(), None + dsinks_ref, sinks.grad = sinks.grad.clone(), None + + # Checks + # Sliding window attention has slightly higher numerical error due to more complex masking + rtol, atol = (2e-2, 2e-2) if window_size is not None else (1e-2, 1e-2) + assert torch.allclose(O_unpad, O_ref_unpad, rtol=rtol, atol=atol), f"O max err: {(O_unpad - O_ref_unpad).abs().max()}" + assert torch.allclose(dV, dV_ref, rtol=rtol, atol=atol), f"dV max err: {(dV - dV_ref).abs().max()}" + assert torch.allclose(dK, dK_ref, rtol=rtol, atol=atol), f"dK max err: {(dK - dK_ref).abs().max()}" + assert torch.allclose(dQ, dQ_ref, rtol=rtol, atol=atol), f"dQ max err: {(dQ - dQ_ref).abs().max()}" + assert torch.allclose(dsinks, dsinks_ref, rtol=rtol, atol=atol), f"dsinks max err: {(dsinks - dsinks_ref).abs().max()}" + + print("All checks passed for tilelang kernels.✅") + + # Benchmark backward + def torch_bwd(): + O_ref_unpad.backward(dO_unpad, retain_graph=True) + + def tl_bwd(): + O_unpad.backward(dO_unpad, retain_graph=True) + + latency = do_bench(torch_bwd, warmup=500) + print("torch: {:.2f} ms".format(latency)) + print("torch: {:.2f} TFlops".format(total_flops / latency * 1e-9)) + latency = do_bench(tl_bwd, warmup=500) + print("tilelang: {:.2f} ms".format(latency)) + print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="query heads") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--q_seqlen", type=int, default=2048, help="query sequence length") + parser.add_argument("--k_seqlen", type=int, default=2048, help="key/value sequence length") + parser.add_argument("--dim", type=int, default=128, help="head dim") + parser.add_argument("--is_causal", action="store_true", help="causal attention") + parser.add_argument("--window_size", type=int, default=None, help="sliding window size (default: None for full attention)") + args = parser.parse_args() + main(args.batch, args.heads, args.q_seqlen, args.k_seqlen, args.dim, args.groups, args.is_causal, args.window_size) diff --git 
a/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py b/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py index 7765603af..fa73df0af 100644 --- a/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py +++ b/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py @@ -6,7 +6,6 @@ from tilelang.autotuner import autotune from tilelang.profiler import do_bench import tilelang.language as T -from tilelang.layout import make_swizzled_layout import itertools import argparse from typing import Optional @@ -23,9 +22,11 @@ def get_configs(): rep=100, ) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn( batch, heads, @@ -39,106 +40,30 @@ def flashattn( block_N=128, num_stages=2, threads=256, - dtype: str = "float16", + dtype: T.dtype = T.float16, ): - if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, head_kv, seq_kv, dim] - accum_dtype = "float" + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" - @T.macro - def MMA0( - K: T.Tensor(kv_shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, by // groups, k * block_N:(k + 1) * block_N, :], K_shared) - for i, j in T.Parallel(block_M, block_N): - q_idx = bx * block_M + i + past_len - k_idx = k * block_N + j - if window_size is not None: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, - -T.infinity(acc_s.dtype)) - else: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(kv_shape, dtype), - V_shared: T.SharedBuffer([block_M, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, by // groups, k * block_N:(k + 1) * block_N, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # NOTE(wt): check_inf is necessary for sliding window attention. 
- for i in T.Parallel(block_M): - if window_size is not None: - scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, - scores_max[i]) - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), - Sinks: T.Tensor([heads], dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + Sinks: T.Tensor([heads], dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -155,58 +80,83 @@ def main( logsum = T.alloc_fragment([block_M], accum_dtype) sinks = T.alloc_fragment([block_M], dtype) - T.annotate_layout({ - Q_shared: make_swizzled_layout(Q_shared), - K_shared: make_swizzled_layout(K_shared), - V_shared: make_swizzled_layout(V_shared), - O_shared: make_swizzled_layout(O_shared), - }) - - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) for i in T.Parallel(block_M): sinks[i] = Sinks[by] - end = T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + end = T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) - start = T.max(0, (bx * block_M + past_len - window_size) // - block_N) if window_size is not None else 0 + start = T.max(0, (bx * block_M + past_len - window_size) // block_N) if window_size is not None else 0 for k in T.Pipelined( - start, - end, - num_stages=num_stages, - order=[-1, 0, 3, 1, -1, 2], - stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + start, + end, + num_stages=num_stages, + order=[-1, 0, 3, 1, -1, 2], + stage=[-1, 0, 0, 1, -1, 1], + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]], + ): + T.copy(K[bz, by // groups, k * block_N : (k + 1) * block_N, :], K_shared) + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + past_len + k_idx = k * block_N + j + if window_size is not None: + acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, -T.infinity(acc_s.dtype)) + else: + acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + 
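# Recompute the row maximum of the current tile and merge it with the previous
+ # running maximum so earlier tiles keep a consistent scaling.
+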
T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + # To do causal softmax, we need to set the scores_max to 0 if it is -inf + # This process is called Check_inf in FlashAttention3 code, and it only need to be done + # NOTE(wt): check_inf is necessary for sliding window attention. + for i in T.Parallel(block_M): + if window_size is not None: + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + # max * log_2(e)) This allows the compiler to use the ffma + # instruction instead of fadd and fmul separately. + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, by // groups, k * block_N : (k + 1) * block_N, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i in T.Parallel(block_M): - logsum[i] += T.exp2(sinks[i] * 1.44269504 - - scores_max[i] * scale) # The only change for attention sink + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) # The only change for attention sink for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main # Following functions are adapted and optimized from # https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py -def ref_program(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - sinks: torch.Tensor, - sliding_window: Optional[int] = None, - dtype: torch.dtype = torch.float16) -> torch.Tensor: - +def ref_program( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sinks: torch.Tensor, + sliding_window: Optional[int] = None, + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: key = key.transpose(1, 2).contiguous() value = value.transpose(1, 2).contiguous() batch_size, num_keys, num_key_value_heads, head_dim = key.shape @@ -242,23 +192,15 @@ def ref_program(query: torch.Tensor, output = torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float()) - output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, - head_dim).to(dtype) + output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, head_dim).to(dtype) return output.transpose(1, 2).contiguous() -def gen_inputs( - B, - H, - Sq, - Skv, - D, - groups, - dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - query = torch.randn([B, H, Sq, D], dtype=dtype, device='cuda') - key = torch.randn([B, H // groups, Skv, D], dtype=dtype, device='cuda') - value = torch.randn([B, H // groups, Skv, D], dtype=dtype, device='cuda') - sinks = torch.randn([H], dtype=dtype, device='cuda') +def gen_inputs(B, H, Sq, Skv, D, groups, dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + query = torch.randn([B, H, Sq, D], dtype=dtype, device="cuda") + key = torch.randn([B, H // groups, 
Skv, D], dtype=dtype, device="cuda") + value = torch.randn([B, H // groups, Skv, D], dtype=dtype, device="cuda") + sinks = torch.randn([H], dtype=dtype, device="cuda") return query, key, value, sinks @@ -270,17 +212,17 @@ def main( dim: int = 128, groups: int = 8, window_size: Optional[int] = None, - dtype: str = "float16", + dtype: T.dtype = T.float16, tune: bool = False, ): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= seq_q - flops_per_matmul = 2.0 * batch * heads * min( - window_size, seq_kv // 2) * seq_q * dim # just a rough estimation + flops_per_matmul = 2.0 * batch * heads * min(window_size, seq_kv // 2) * seq_q * dim # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim * 0.5 total_flops = 2 * flops_per_matmul @@ -308,15 +250,14 @@ def main( block_N=block_N, num_stages=num_stages, threads=threads, - dtype=dtype) + dtype=dtype, + ) Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, groups, dtype=torch_dtype) torch.testing.assert_close( - kernel(Q, K, V, sinks), - ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), - rtol=1e-2, - atol=1e-2) + kernel(Q, K, V, sinks), ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), rtol=1e-2, atol=1e-2 + ) print("All checks passed.✅") # Benchmark tilelang @@ -325,22 +266,51 @@ def main( print("Tilelang: {:.2f} TFlops".format(total_flops / latency_tilelang * 1e-9)) +def run_regression_perf( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + groups: int = 8, + window_size: Optional[int] = None, + dtype: str = "float16", + tune: bool = False, +): + torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + block_M = 128 + block_N = 128 + num_stages = 2 + threads = 256 + kernel = flashattn( + batch, + heads, + seq_q, + seq_kv, + dim, + groups, + window_size, + block_M=block_M, + block_N=block_N, + num_stages=num_stages, + threads=threads, + dtype=dtype, + ) + Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, groups, dtype=torch_dtype) + latency = do_bench(lambda: kernel(Q, K, V, sinks), backend="cupti") + return latency + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='heads') - parser.add_argument('--seq_q', type=int, default=2048, help='sequence length of query') - parser.add_argument('--seq_kv', type=int, default=2048, help='sequence length of key/value') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--groups', type=int, default=8, help='groups') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="heads") + parser.add_argument("--seq_q", type=int, default=2048, help="sequence length of query") + parser.add_argument("--seq_kv", type=int, 
default=2048, help="sequence length of key/value") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--groups", type=int, default=8, help="groups") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default="float16", help="dtype, can be float16 or bfloat16") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.groups, args.window_size, - args.dtype, args.tune) + main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.groups, args.window_size, args.dtype, args.tune) diff --git a/examples/attention_sink/example_gqa_sink_fwd_varlen.py b/examples/attention_sink/example_gqa_sink_fwd_varlen.py new file mode 100644 index 000000000..16838dd86 --- /dev/null +++ b/examples/attention_sink/example_gqa_sink_fwd_varlen.py @@ -0,0 +1,401 @@ +# ruff: noqa +# Using varlen (variable length) format with attention sink + +import argparse +import torch +import tilelang +import tilelang.language as T +import tilelang.testing +from tilelang.profiler import do_bench +from typing import Optional +import sys +import os + +sys.path.append(os.path.join(os.path.dirname(__file__), "../flash_attention")) +from varlen_utils import generate_random_padding_mask, generate_qkv + + +@tilelang.jit( + out_idx=[7], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def flashattn_sink( + batch_size, + groups, + UQ, + UKV, + heads, + dim, + is_causal, + window_size=None, # None for full causal attention + sm_scale=None, + block_M=64, + block_N=64, + num_stages=1, + threads=128, +): + if window_size is not None: + assert window_size % block_N == 0, "window_size must be divisible by block_N" + + if sm_scale is None: + sm_scale = (1.0 / dim) ** 0.5 + scale = sm_scale * 1.44269504 # log2(e) + + head_kv = heads // groups + q_shape = [UQ, heads, dim] + kv_shape = [UKV, head_kv, dim] + o_shape = [UQ, heads, dim] + dtype = T.float16 + accum_dtype = T.float32 + + @T.prim_func + def main( + Q_unpad: T.Tensor(q_shape, dtype), + K_unpad: T.Tensor(kv_shape, dtype), + V_unpad: T.Tensor(kv_shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + cu_seqlens_k: T.Tensor([batch_size + 1], T.int32), + max_seqlen_q: T.int32, + Sinks: T.Tensor([heads], dtype), + Output_unpad: T.Tensor(o_shape, dtype), + ): + with T.Kernel(T.ceildiv(max_seqlen_q, block_M), heads, batch_size, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_M, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + O_shared = T.alloc_shared([block_M, dim], dtype) + acc_s = T.alloc_fragment([block_M, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_M, block_N], dtype) + acc_o = T.alloc_fragment([block_M, dim], accum_dtype) + scores_max = T.alloc_fragment([block_M], accum_dtype) + scores_max_prev = T.alloc_fragment([block_M], accum_dtype) + scores_scale = T.alloc_fragment([block_M], accum_dtype) + scores_sum = T.alloc_fragment([block_M], accum_dtype) + logsum = T.alloc_fragment([block_M], accum_dtype) + sinks = T.alloc_fragment([block_M], dtype) + + batch_idx = bz + head_idx = by + kv_head_idx = head_idx // groups + + q_start_idx = cu_seqlens_q[batch_idx] + kv_start_idx = cu_seqlens_k[batch_idx] + q_end_idx = cu_seqlens_q[batch_idx + 1] + k_end_idx = 
cu_seqlens_k[batch_idx + 1] + + q_current_seqlen = q_end_idx - q_start_idx + kv_current_seqlen = k_end_idx - kv_start_idx + + T.copy(Q_unpad[q_start_idx + bx * block_M : q_start_idx + (bx + 1) * block_M, head_idx, :], Q_shared) + + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + for i in T.Parallel(block_M): + sinks[i] = Sinks[head_idx] + + offset = kv_current_seqlen - q_current_seqlen # always align on the right + max_visible_k_idx = offset + (bx + 1) * block_M + + # Determine loop range based on causal mask and sliding window + if is_causal: + if window_size is not None: + # Sliding window + causal: start from window boundary + start = T.max(0, (offset + bx * block_M - window_size + 1) // block_N) + end = T.min(T.ceildiv(max_visible_k_idx, block_N), T.ceildiv(kv_current_seqlen, block_N)) + else: + # Full causal attention + start = 0 + end = T.min(T.ceildiv(max_visible_k_idx, block_N), T.ceildiv(kv_current_seqlen, block_N)) + else: + if window_size is not None: + start = T.max(0, (offset + bx * block_M - window_size + 1) // block_N) + end = T.ceildiv(kv_current_seqlen, block_N) + else: + start = 0 + end = T.ceildiv(kv_current_seqlen, block_N) + + loop_range = end - start + + for k in T.Pipelined(loop_range, num_stages=num_stages): + actual_k = k + start + T.copy(K_unpad[kv_start_idx + actual_k * block_N : kv_start_idx + (actual_k + 1) * block_N, kv_head_idx, :], K_shared) + + # Build mask considering causal, sliding window, and padding + if is_causal: + if window_size is not None: + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + offset + k_idx = actual_k * block_N + j + # Causal + sliding window mask + acc_s[i, j] = T.if_then_else( + (q_idx < k_idx) # causal: can't see future + or (q_idx >= k_idx + window_size) # sliding window: too old + or (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else( + (bx * block_M + i + offset < actual_k * block_N + j) + or (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) + else: + if window_size is not None: + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + offset + k_idx = actual_k * block_N + j + acc_s[i, j] = T.if_then_else( + (q_idx >= k_idx + window_size) # sliding window: too old + or (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) + + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + + # Check_inf for sliding window attention + if window_size is not None: + for i in T.Parallel(block_M): + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) 
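+ # Accumulate the softmax denominator below: l_new = l_old * alpha + rowsum(p).
+ # The attention-sink term exp2(sink * log2(e) - m_final * scale) is added to
+ # logsum once, after the KV loop, so the sink only enters the normalizer.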
+ T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V_unpad[kv_start_idx + actual_k * block_N : kv_start_idx + (actual_k + 1) * block_N, kv_head_idx, :], V_shared) + + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + + # Attention sink: add sink contribution to logsum + for i in T.Parallel(block_M): + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) + + for i, j in T.Parallel(block_M, dim): + # When sq > skv, some tokens can see nothing (for causal) + acc_o[i, j] = 0 if is_causal and bx * block_M + i + offset < 0 else acc_o[i, j] / logsum[i] + + T.copy(acc_o, O_shared) + for i, d in T.Parallel(block_M, dim): + if bx * block_M + i < q_current_seqlen: + Output_unpad[q_start_idx + bx * block_M + i, head_idx, d] = O_shared[i, d] + + return main + + +def ref_program( + q_unpad: torch.Tensor, + k_unpad: torch.Tensor, + v_unpad: torch.Tensor, + cu_seqlens_q: torch.Tensor, + cu_seqlens_k: torch.Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + sinks: torch.Tensor, + batch_size: int, + is_causal: bool, + sliding_window: Optional[int] = None, + groups: int = 1, +) -> torch.Tensor: + """Reference implementation for varlen attention with sinks.""" + # q_unpad: [total_q, heads, dim] + # k_unpad: [total_kv, head_kv, dim] + # v_unpad: [total_kv, head_kv, dim] + total_q, num_heads, head_dim = q_unpad.shape + _, num_key_value_heads, _ = k_unpad.shape + + sm_scale = 1.0 / head_dim**0.5 + + output = torch.zeros_like(q_unpad) + + for b in range(batch_size): + q_start = cu_seqlens_q[b].item() + q_end = cu_seqlens_q[b + 1].item() + k_start = cu_seqlens_k[b].item() + k_end = cu_seqlens_k[b + 1].item() + + q_len = q_end - q_start + k_len = k_end - k_start + + if q_len == 0: + continue + + # Extract sequences for this batch + q_seq = q_unpad[q_start:q_end] # [q_len, heads, dim] + k_seq = k_unpad[k_start:k_end] # [k_len, head_kv, dim] + v_seq = v_unpad[k_start:k_end] # [k_len, head_kv, dim] + + # Reshape for GQA + q_seq = q_seq.view(q_len, num_key_value_heads, groups, head_dim) # [q_len, head_kv, groups, dim] + sinks_expanded = sinks.view(num_key_value_heads, groups, 1, 1).float() # [head_kv, groups, 1, 1] + + k_seq = k_seq.unsqueeze(2) # [k_len, head_kv, 1, dim] + v_seq = v_seq.unsqueeze(2) # [k_len, head_kv, 1, dim] + + # Compute attention + # q_seq: [q_len, head_kv, groups, dim], k_seq: [k_len, head_kv, 1, dim] + logits = torch.einsum("qhgd,khgd->hgqk", q_seq.float(), k_seq.float()) * sm_scale + + # Build mask + start_q = k_len - q_len # offset for causal alignment + pos_keys = torch.arange(k_len, device=q_unpad.device) + pos_queries = torch.arange(q_len, device=q_unpad.device) + start_q + + if is_causal: + mask = pos_keys[None, :] > pos_queries[:, None] + mask = mask.float().masked_fill(mask, float("-inf")) + else: + mask = torch.zeros(q_len, k_len, device=q_unpad.device) + + if sliding_window is not None: + too_old = pos_keys[None, :] < (pos_queries[:, None] - sliding_window + 1) + mask.masked_fill_(too_old, float("-inf")) + + logits = logits + mask[None, None, :, :] # [head_kv, groups, q_len, k_len] + + # Apply sink-adjusted softmax + logits_max = torch.max(logits, dim=-1, keepdim=True).values + logits_or_sinks_max = torch.maximum(sinks_expanded, logits_max) + sinks_exp = torch.exp(sinks_expanded - logits_or_sinks_max) + unnormalized_scores = torch.exp(logits - 
logits_or_sinks_max) + normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks_exp + scores = unnormalized_scores / normalizer + + # Compute output + out = torch.einsum("hgqk,khgd->qhgd", scores, v_seq.float()) + out = out.reshape(q_len, num_heads, head_dim).to(q_unpad.dtype) + + output[q_start:q_end] = out + + return output + + +def main( + batch: int = 1, + heads: int = 64, + q_seqlen: int = 2048, + k_seqlen: int = 2048, + dim: int = 128, + groups: int = 16, + is_causal: bool = True, + window_size: Optional[int] = None, +): + assert heads % groups == 0, "heads must be divisible by groups" + + flops_per_matmul = 2.0 * batch * heads * q_seqlen * k_seqlen * dim + total_flops = 2 * flops_per_matmul + + tilelang.testing.set_random_seed(0) + + if is_causal: + total_flops *= 0.5 + + if window_size is not None: + print(f"Using sliding window attention with window_size={window_size}") + flops_per_matmul = 2.0 * batch * heads * min(window_size, k_seqlen // 2) * q_seqlen * dim + total_flops = 2 * flops_per_matmul + + dtype = torch.float16 + device = torch.device("cuda") + + head_kv = heads // groups + q = torch.randn(batch, q_seqlen, heads, dim, dtype=dtype, device=device) + k = torch.randn(batch, k_seqlen, head_kv, dim, dtype=dtype, device=device) + v = torch.randn(batch, k_seqlen, head_kv, dim, dtype=dtype, device=device) + sinks = torch.randn(heads, dtype=dtype, device=device) + + query_padding_mask = generate_random_padding_mask(q_seqlen, batch, device, mode="random") + key_padding_mask = generate_random_padding_mask(k_seqlen, batch, device, mode="random") + + ( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + k, + v, + output_pad_fn, + _, + _, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + + UQ = q_unpad.shape[0] + UKV = k_unpad.shape[0] + + kernel = flashattn_sink( + batch, groups, UQ, UKV, heads, dim, is_causal, window_size=window_size, block_M=128, block_N=128, num_stages=2, threads=256 + ) + + out_unpad = kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, sinks) + out = output_pad_fn(out_unpad) + + # Reference implementation + ref_out_unpad = ref_program( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + sinks, + batch, + is_causal, + sliding_window=window_size, + groups=groups, + ) + ref_out = output_pad_fn(ref_out_unpad) + + torch.testing.assert_close(out, ref_out, rtol=1e-2, atol=1e-2) + + print("All checks passed.✅") + latency = do_bench( + lambda: kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, sinks), + warmup=500, + ) + print("Tile-lang: {:.2f} ms".format(latency)) + print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="query heads") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--q_seqlen", type=int, default=2048, help="query sequence length") + parser.add_argument("--k_seqlen", type=int, default=2048, help="key/value sequence length") + parser.add_argument("--dim", type=int, default=128, help="head dim") + parser.add_argument("--is_causal", action="store_true", help="causal attention") + parser.add_argument("--window_size", type=int, default=None, help="sliding window size (default: None for full attention)") + args 
= parser.parse_args() + main(args.batch, args.heads, args.q_seqlen, args.k_seqlen, args.dim, args.groups, args.is_causal, args.window_size) diff --git a/examples/attention_sink/example_mha_sink_bwd_bhsd.py b/examples/attention_sink/example_mha_sink_bwd_bhsd.py index 866668e41..66905f55d 100644 --- a/examples/attention_sink/example_mha_sink_bwd_bhsd.py +++ b/examples/attention_sink/example_mha_sink_bwd_bhsd.py @@ -20,40 +20,42 @@ def get_bwd_configs(): @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd( - batch, - heads, - seq_len, - dim, - window_size=None, # None for full attention, - sm_scale=None, - block_M=64, - block_N=64, - num_stages=1, - threads=128, - dtype: str = "float16"): - + batch, + heads, + seq_len, + dim, + window_size=None, # None for full attention, + sm_scale=None, + block_M=64, + block_N=64, + num_stages=1, + threads=128, + dtype: T.dtype = T.float16, +): if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] - accum_dtype = "float" + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - Output: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Sinks: T.Tensor([heads], dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + Output: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Sinks: T.Tensor([heads], dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -69,8 +71,7 @@ def flash_fwd( logsum = T.alloc_fragment([block_M], accum_dtype) sinks = T.alloc_fragment([heads], dtype) - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -78,31 +79,30 @@ def flash_fwd( sinks[i] = Sinks[by] end = T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) - start = T.max(0, - (bx * block_M - window_size) // block_N) if window_size is not None else 0 + start = T.max(0, (bx * block_M - window_size) // block_N) if window_size is not None else 0 for k in T.Pipelined(start, end, num_stages=num_stages): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) for i, j in T.Parallel(block_M, block_N): q_idx = bx * block_M + i k_idx = k * block_N + j if window_size is not None: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, - 0, -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, -T.infinity(acc_s.dtype)) else: acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, by, k * 
block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # NOTE(wt): check_inf is necessary for sliding window attention. for i in T.Parallel(block_M): if window_size is not None: - scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, - scores_max[i]) + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -119,32 +119,33 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i in T.Parallel(block_M): - logsum[i] += T.exp2(sinks[i] * 1.44269504 - - scores_max[i] * scale) # The only change for attention sink + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) # The only change for attention sink for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(acc_o, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: str = "float16"): - accum_dtype = "float" + }, +) +def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -153,49 +154,52 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], do) + T.copy(O[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep def make_dq_layout(dQ): # atomicAdd can not be vectorized, so we need to reorder dq to match the 8x8 gemm fragment - return T.Layout(dQ.shape, - lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) + return T.Layout(dQ.shape, lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) @tilelang.jit( - out_idx=[1], 
pass_configs={ + out_idx=[1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_bwd_postprocess(batch, heads, seq_len, dim, dtype: str = "float16"): - accum_dtype = "float" + }, +) +def flashattn_bwd_postprocess(batch, heads, seq_len, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 64 @T.prim_func def flash_bwd_post( - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(shape, dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) T.copy( - dQ[bz, by, bx * blk:(bx + 1) * blk, :], - dQ_out[bz, by, bx * blk:(bx + 1) * blk, :], + dQ[bz, by, bx * blk : (bx + 1) * blk, :], + dQ_out[bz, by, bx * blk : (bx + 1) * blk, :], ) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def flashattn_bwd( batch, heads, @@ -203,32 +207,31 @@ def flashattn_bwd( dim, window_size=None, # None for full attention sm_scale=None, - dtype: str = "float16", + dtype: T.dtype = T.float16, ): - block_M, block_N, num_stages, threads = get_bwd_configs() if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] - accum_dtype = "float" + accum_dtype = T.float32 if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" @T.prim_func def flash_bwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dK: T.Tensor(shape, dtype), # type: ignore - dV: T.Tensor(shape, dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dK: T.Tensor(shape, dtype), # type: ignore + dV: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim], dtype) @@ -252,43 +255,43 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim], dtype) dk_shared = T.alloc_shared([block_M, dim], dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) - T.copy(K[bz, bx, by * block_M:(by + 1) * block_M, :], K_shared) - T.copy(V[bz, bx, by * block_M:(by + 1) * block_M, :], V_shared) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) + T.copy(K[bz, bx, by * block_M : (by + 1) * block_M, :], K_shared) + T.copy(V[bz, bx, by * block_M : (by + 1) * block_M, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, 
block_N) - loop_ed = T.min( - T.ceildiv((by + 1) * block_M + window_size, block_N), T.ceildiv( - seq_len, block_N)) if window_size is not None else T.ceildiv(seq_len, block_N) + loop_ed = ( + T.min(T.ceildiv((by + 1) * block_M + window_size, block_N), T.ceildiv(seq_len, block_N)) + if window_size is not None + else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, bx, k * block_N:(k + 1) * block_N, :], q) + T.copy(Q[bz, bx, k * block_N : (k + 1) * block_N, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) for i, j in T.Parallel(block_M, block_N): if window_size is not None: qkT[i, j] = T.if_then_else( - by * block_M + i <= k * block_N + j and - by * block_M + i > k * block_N + j - window_size, qkT[i, j], 0) + by * block_M + i <= k * block_N + j and by * block_M + i > k * block_N + j - window_size, qkT[i, j], 0 + ) else: - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, bx, k * block_N:(k + 1) * block_N, :], dst=do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, bx, k * block_N : (k + 1) * block_N, :], dst=do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, B=do, C=dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -297,51 +300,48 @@ def flash_bwd( T.copy(dsT_cast, dsT_shared) T.clear(dq) T.gemm(dsT_shared, K_shared, dq, transpose_A=True) - T.atomic_add(dQ[bz, bx, k * block_N:(k + 1) * block_N, :], dq) + T.atomic_add(dQ[bz, bx, k * block_N : (k + 1) * block_N, :], dq) T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, dV[bz, bx, by * block_M:(by + 1) * block_M, :]) - T.copy(dk_shared, dK[bz, bx, by * block_M:(by + 1) * block_M, :]) + T.copy(dv_shared, dV[bz, bx, by * block_M : (by + 1) * block_M, :]) + T.copy(dk_shared, dK[bz, bx, by * block_M : (by + 1) * block_M, :]) return flash_bwd @tilelang.jit(out_idx=-1) -def flashattn_bwd_dsink(batch, heads, seq_len, block=128, dtype: str = "float16"): - accum_dtype = "float" +def flashattn_bwd_dsink(batch, heads, seq_len, block=128, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len] @T.prim_func def flash_bwd_dsink( - Sinks: T.Tensor([heads], dtype), # type: ignore - Delta: T.Tensor(shape, accum_dtype), # type: ignore - lse: T.Tensor(shape, accum_dtype), # type: ignore - dsinks: T.Tensor(shape, accum_dtype), # type: ignore + Sinks: T.Tensor([heads], dtype), # type: ignore + Delta: T.Tensor(shape, accum_dtype), # type: ignore + lse: T.Tensor(shape, accum_dtype), # type: ignore + dsinks: T.Tensor(shape, accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block), batch, threads=128) as (bx, by, bz): - sink = T.alloc_local([1], dtype) lse_fragment = T.alloc_fragment([block], accum_dtype) delta_fragment = T.alloc_fragment([block], accum_dtype) dsink_fragment = T.alloc_fragment([block], accum_dtype) - sink[0] = Sinks[bx] - T.copy(lse[bz, bx, by * block:(by + 1) * 
block], lse_fragment) - T.copy(Delta[bz, bx, by * block:(by + 1) * block], delta_fragment) + sink = Sinks[bx] + T.copy(lse[bz, bx, by * block : (by + 1) * block], lse_fragment) + T.copy(Delta[bz, bx, by * block : (by + 1) * block], delta_fragment) for i in T.Parallel(block): - dsink_fragment[i] = -T.exp2(Sinks[bx] * 1.44269504 - - lse_fragment[i]) * delta_fragment[i] - T.copy(dsink_fragment, dsinks[bz, bx, by * block:(by + 1) * block]) + dsink_fragment[i] = -T.exp2(sink * 1.44269504 - lse_fragment[i]) * delta_fragment[i] + T.copy(dsink_fragment, dsinks[bz, bx, by * block : (by + 1) * block]) return flash_bwd_dsink class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, sinks, window_size): BATCH, H, N_CTX, D_HEAD = q.shape - dtype = "float16" if q.dtype == torch.float16 else "bfloat16" + dtype = T.float16 if q.dtype == torch.float16 else T.bfloat16 kernel = flashattn_fwd(BATCH, H, N_CTX, D_HEAD, window_size, dtype=dtype) o, lse = kernel(q, k, v, sinks) ctx.save_for_backward(q, k, v, sinks, o, lse) @@ -359,7 +359,7 @@ def maybe_contiguous(x): return x do, q, k, v, sinks, o = [maybe_contiguous(x) for x in (do, q, k, v, sinks, o)] - dtype = "float16" if q.dtype == torch.float16 else "bfloat16" + dtype = T.float16 if q.dtype == torch.float16 else T.bfloat16 kernel_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) kernel_post = flashattn_bwd_postprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) delta = kernel_prep(o, do) @@ -381,15 +381,15 @@ def maybe_contiguous(x): # Adapted and optimized from # https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py -def ref_program(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - sinks: torch.Tensor, - sliding_window: Optional[int] = None, - dtype: torch.dtype = torch.float16) -> torch.Tensor: - - query = query.transpose(1, 2).contiguous().unsqueeze( - 3) # align with the original function's interface +def ref_program( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sinks: torch.Tensor, + sliding_window: Optional[int] = None, + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + query = query.transpose(1, 2).contiguous().unsqueeze(3) # align with the original function's interface key = key.transpose(1, 2).contiguous() value = value.transpose(1, 2).contiguous() @@ -424,29 +424,23 @@ def ref_program(query: torch.Tensor, output = torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float()) - output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, - head_dim).to(dtype) + output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, head_dim).to(dtype) return output.transpose(1, 2).contiguous() -def main(BATCH: int = 1, - H: int = 1, - N_CTX: int = 512, - D_HEAD: int = 128, - window_size: Optional[int] = None, - dtype: str = "float16"): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] +def main(BATCH: int = 1, H: int = 1, N_CTX: int = 512, D_HEAD: int = 128, window_size: Optional[int] = None, dtype: T.dtype = T.float16): + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= N_CTX - flops_per_matmul = 2.0 * BATCH * H * min( - window_size, N_CTX // 2) * N_CTX * D_HEAD # just a rough estimation + flops_per_matmul = 2.0 * BATCH * H * min(window_size, N_CTX // 2) * N_CTX * D_HEAD # just a rough estimation else: - print('Using full 
attention.') + print("Using full attention.") flops_per_matmul = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD * 0.5 total_flops = 5 * flops_per_matmul - Q = (torch.randn(BATCH, H, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_()) + Q = torch.randn(BATCH, H, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_() K = torch.randn_like(Q).requires_grad_() V = torch.randn_like(Q).requires_grad_() sinks = torch.randn(H, dtype=torch_dtype, device=Q.device).requires_grad_() @@ -468,19 +462,14 @@ def main(BATCH: int = 1, # Checks rtol, atol = { - "float16": (1e-2, 1e-2), - "bfloat16": (2e-2, 2e-2), + T.float16: (1e-2, 1e-2), + T.bfloat16: (2e-2, 2e-2), }[dtype] - assert torch.allclose(O, O_ref, rtol=rtol, atol=atol), f'O max err: {(O-O_ref).abs().max()}' - assert torch.allclose( - dV, dV_ref, rtol=rtol, atol=atol), f'dV max err: {(dV-dV_ref).abs().max()}' - assert torch.allclose( - dK, dK_ref, rtol=rtol, atol=atol), f'dK max err: {(dK-dK_ref).abs().max()}' - assert torch.allclose( - dQ, dQ_ref, rtol=rtol, atol=atol), f'dq max err: {(dQ-dQ_ref).abs().max()}' - assert torch.allclose( - dsinks, dsinks_ref, rtol=rtol, - atol=atol), f'dsinks max err: {(dsinks-dsinks_ref).abs().max()}' + assert torch.allclose(O, O_ref, rtol=rtol, atol=atol), f"O max err: {(O - O_ref).abs().max()}" + assert torch.allclose(dV, dV_ref, rtol=rtol, atol=atol), f"dV max err: {(dV - dV_ref).abs().max()}" + assert torch.allclose(dK, dK_ref, rtol=rtol, atol=atol), f"dK max err: {(dK - dK_ref).abs().max()}" + assert torch.allclose(dQ, dQ_ref, rtol=rtol, atol=atol), f"dq max err: {(dQ - dQ_ref).abs().max()}" + assert torch.allclose(dsinks, dsinks_ref, rtol=rtol, atol=atol), f"dsinks max err: {(dsinks - dsinks_ref).abs().max()}" print("All checks passed for tilelang kernels.✅") @@ -499,18 +488,53 @@ def tl_bwd(): print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf( + BATCH: int = 1, + H: int = 32, + N_CTX: int = 512, + D_HEAD: int = 128, + window_size: Optional[int] = None, + dtype: str = "float16", +): + torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + with torch.no_grad(): + Q = torch.randn(BATCH, H, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda") + K = torch.randn_like(Q) + V = torch.randn_like(Q) + sinks = torch.randn(H, dtype=torch_dtype, device=Q.device) + dO = torch.randn_like(Q) + fwd = flashattn_fwd(BATCH, H, N_CTX, D_HEAD, window_size=window_size, dtype=dtype) + O, lse = fwd(Q, K, V, sinks) + + def maybe_contiguous(x): + return x if x.stride(-1) == 1 else x.contiguous() + + do, q, k, v, sinks_c, o = [maybe_contiguous(x) for x in (dO, Q, K, V, sinks, O)] + k_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) + Delta = k_prep(o, do) + k_bwd = flashattn_bwd(BATCH, H, N_CTX, D_HEAD, window_size, dtype=dtype) + k_dsink = flashattn_bwd_dsink(BATCH, H, N_CTX, dtype=dtype) + shape = (BATCH, H, N_CTX, D_HEAD) + dq = torch.zeros(shape, dtype=torch.float32, device=Q.device) + dk = torch.empty(shape, dtype=torch_dtype, device=Q.device) + dv = torch.empty(shape, dtype=torch_dtype, device=Q.device) + k_bwd(q, k, v, do, lse, Delta, dq, dk, dv) + _ = k_dsink(sinks_c, Delta, lse).sum(0).sum(1) + + def run_kernel_only(): + k_bwd(q, k, v, do, lse, Delta, dq, dk, dv) + + latency_ms = do_bench(run_kernel_only, backend="cupti") + return latency_ms + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='Batch size') - parser.add_argument('--h', type=int, 
default=64, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=4096, help='Context size') - parser.add_argument('--d_head', type=int, default=128, help='Head dimension') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") + parser.add_argument("--batch", type=int, default=1, help="Batch size") + parser.add_argument("--h", type=int, default=64, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=4096, help="Context size") + parser.add_argument("--d_head", type=int, default=128, help="Head dimension") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default="float16", help="dtype, can be float16 or bfloat16") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, args.window_size, args.dtype) diff --git a/examples/attention_sink/example_mha_sink_fwd_bhsd.py b/examples/attention_sink/example_mha_sink_fwd_bhsd.py index 2449b090c..f24aa38b7 100644 --- a/examples/attention_sink/example_mha_sink_fwd_bhsd.py +++ b/examples/attention_sink/example_mha_sink_fwd_bhsd.py @@ -5,7 +5,6 @@ from tilelang.autotuner import autotune from tilelang.profiler import do_bench import tilelang.language as T -from tilelang.layout import make_swizzled_layout import itertools import argparse from typing import Optional @@ -18,117 +17,45 @@ def get_configs(): @autotune(configs=get_configs(), warmup=500, rep=100) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn( - batch, - heads, - seq_q, - seq_kv, - dim, - window_size=None, # None for full attention - sm_scale=None, - block_M=64, - block_N=64, - num_stages=1, - threads=128, - dtype: str = "float16"): + batch, + heads, + seq_q, + seq_kv, + dim, + window_size=None, # None for full attention + sm_scale=None, + block_M=64, + block_N=64, + num_stages=1, + threads=128, + dtype: T.dtype = T.float16, +): if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, heads, seq_kv, dim] - accum_dtype = "float" + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" - @T.macro - def MMA0( - K: T.Tensor(kv_shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) - for i, j in T.Parallel(block_M, block_N): - q_idx = bx * block_M + i + past_len - k_idx = k * block_N + j - if window_size is not None: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, - -T.infinity(acc_s.dtype)) - else: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(kv_shape, dtype), - V_shared: T.SharedBuffer([block_M, dim], dtype), 
- acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # NOTE(wt): check_inf is necessary for sliding window attention. - for i in T.Parallel(block_M): - if window_size is not None: - scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, - scores_max[i]) - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), - Sinks: T.Tensor([heads], dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + Sinks: T.Tensor([heads], dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -145,53 +72,76 @@ def main( logsum = T.alloc_fragment([block_M], accum_dtype) sinks = T.alloc_fragment([block_M], dtype) - T.annotate_layout({ - Q_shared: make_swizzled_layout(Q_shared), - K_shared: make_swizzled_layout(K_shared), - V_shared: make_swizzled_layout(V_shared), - O_shared: make_swizzled_layout(O_shared), - }) - - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) for i in T.Parallel(block_M): sinks[i] = Sinks[by] - end = T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + end = T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) - start = T.max(0, (bx * block_M + past_len - window_size) // - block_N) if window_size is not None else 0 + start = T.max(0, (bx * block_M + past_len - window_size) // block_N) if window_size is not None else 0 for k in T.Pipelined(start, end, 
num_stages=num_stages): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + past_len + k_idx = k * block_N + j + if window_size is not None: + acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, -T.infinity(acc_s.dtype)) + else: + acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + # To do causal softmax, we need to set the scores_max to 0 if it is -inf + # This process is called Check_inf in FlashAttention3 code, and it only need to be done + # NOTE(wt): check_inf is necessary for sliding window attention. + for i in T.Parallel(block_M): + if window_size is not None: + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + # max * log_2(e)) This allows the compiler to use the ffma + # instruction instead of fadd and fmul separately. + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i in T.Parallel(block_M): - logsum[i] += T.exp2(sinks[i] * 1.44269504 - - scores_max[i] * scale) # The only change for attention sink + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) # The only change for attention sink for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main # Modified from https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py -def ref_program(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - sinks: torch.Tensor, - sliding_window: Optional[int] = None, - dtype: torch.dtype = torch.float16) -> torch.Tensor: - - query = query.transpose(1, 2).contiguous().unsqueeze( - 3) # align with the original function's interface +def ref_program( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sinks: torch.Tensor, + sliding_window: Optional[int] = None, + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + query = query.transpose(1, 2).contiguous().unsqueeze(3) # align with the original function's interface key = key.transpose(1, 2).contiguous() value = value.transpose(1, 2).contiguous() @@ -226,41 +176,36 @@ def ref_program(query: torch.Tensor, output = torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float()) - output = output.reshape(batch_size, 
num_queries, num_key_value_heads * num_key_value_groups, - head_dim).to(dtype) + output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, head_dim).to(dtype) return output.transpose(1, 2).contiguous() -def gen_inputs( - B, - H, - Sq, - Skv, - D, - dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - query = torch.randn([B, H, Sq, D], dtype=dtype, device='cuda') - key = torch.randn([B, H, Skv, D], dtype=dtype, device='cuda') - value = torch.randn([B, H, Skv, D], dtype=dtype, device='cuda') - sinks = torch.randn([H], dtype=dtype, device='cuda') +def gen_inputs(B, H, Sq, Skv, D, dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + query = torch.randn([B, H, Sq, D], dtype=dtype, device="cuda") + key = torch.randn([B, H, Skv, D], dtype=dtype, device="cuda") + value = torch.randn([B, H, Skv, D], dtype=dtype, device="cuda") + sinks = torch.randn([H], dtype=dtype, device="cuda") return query, key, value, sinks -def main(batch: int = 1, - heads: int = 1, - seq_q: int = 256, - seq_kv: int = 256, - dim: int = 128, - window_size: Optional[int] = None, - dtype: str = "float16", - tune: bool = False): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] +def main( + batch: int = 1, + heads: int = 1, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + window_size: Optional[int] = None, + dtype: T.dtype = T.float16, + tune: bool = False, +): + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= seq_q - flops_per_matmul = 2.0 * batch * heads * min( - window_size, seq_kv // 2) * seq_q * dim # just a rough estimation + flops_per_matmul = 2.0 * batch * heads * min(window_size, seq_kv // 2) * seq_q * dim # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim * 0.5 total_flops = 2 * flops_per_matmul @@ -287,19 +232,17 @@ def main(batch: int = 1, block_N=block_N, num_stages=num_stages, threads=threads, - dtype=dtype) + dtype=dtype, + ) Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, dtype=torch_dtype) torch.testing.assert_close( - kernel(Q, K, V, sinks), - ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), - rtol=1e-2, - atol=1e-2) + kernel(Q, K, V, sinks), ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), rtol=1e-2, atol=1e-2 + ) print("All checks passed.✅") - latency = do_bench( - lambda: ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), warmup=500) + latency = do_bench(lambda: ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), warmup=500) print("Ref: {:.2f} ms".format(latency)) print("Ref: {:.2f} TFlops".format(total_flops / latency * 1e-9)) latency = do_bench(lambda: kernel(Q, K, V, sinks), warmup=500) @@ -307,21 +250,37 @@ def main(batch: int = 1, print("Tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + window_size: Optional[int] = None, + dtype: str = "float16", +): + torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + block_M = 128 + block_N = 128 + num_stages = 2 + threads = 256 + kernel = flashattn( + batch, heads, seq_q, seq_kv, dim, window_size, block_M=block_M, block_N=block_N, num_stages=num_stages, 
threads=threads, dtype=dtype + ) + Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, dtype=torch_dtype) + latency = do_bench(lambda: kernel(Q, K, V, sinks), backend="cupti") + return latency + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_q', type=int, default=4096, help='sequence length of query') - parser.add_argument('--seq_kv', type=int, default=4096, help='sequence length of key/value') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") - parser.add_argument('--tune', action='store_true', help='tune') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_q", type=int, default=4096, help="sequence length of query") + parser.add_argument("--seq_kv", type=int, default=4096, help="sequence length of key/value") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default=T.float16, help="dtype, can be float16 or bfloat16") + parser.add_argument("--tune", action="store_true", help="tune") args = parser.parse_args() - main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, - args.tune) + main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, args.tune) diff --git a/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py b/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py index 352844075..b47c8175f 100644 --- a/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py +++ b/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py @@ -6,7 +6,6 @@ from tilelang.autotuner import autotune from tilelang.profiler import do_bench import tilelang.language as T -from tilelang.layout import make_swizzled_layout import itertools import argparse from typing import Optional @@ -19,119 +18,46 @@ def get_configs(): @autotune(configs=get_configs(), warmup=500, rep=100) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn( - batch, - heads, - seq_q, - seq_kv, - dim, - window_size=None, # None for full attention - sm_scale=None, - block_M=128, - block_N=128, - num_stages=2, - threads=256, - dtype: str = "float16"): - + batch, + heads, + seq_q, + seq_kv, + dim, + window_size=None, # None for full attention + sm_scale=None, + block_M=128, + block_N=128, + num_stages=2, + threads=256, + dtype: T.dtype = T.float16, +): if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, heads, seq_kv, dim] - accum_dtype = "float" + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater 
than or equal to seq_q" - @T.macro - def MMA0( - K: T.Tensor(kv_shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) - for i, j in T.Parallel(block_M, block_N): - q_idx = bx * block_M + i + past_len - k_idx = k * block_N + j - if window_size is not None: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, - -T.infinity(acc_s.dtype)) - else: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(kv_shape, dtype), - V_shared: T.SharedBuffer([block_M, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # NOTE(wt): check_inf is necessary for sliding window attention. - for i in T.Parallel(block_M): - if window_size is not None: - scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, - scores_max[i]) - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. 
- acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), - Sinks: T.Tensor([heads], dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + Sinks: T.Tensor([heads], dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -148,60 +74,84 @@ def main( logsum = T.alloc_fragment([block_M], accum_dtype) sinks = T.alloc_fragment([block_M], dtype) - T.annotate_layout({ - Q_shared: make_swizzled_layout(Q_shared), - K_shared: make_swizzled_layout(K_shared), - V_shared: make_swizzled_layout(V_shared), - O_shared: make_swizzled_layout(O_shared), - }) - - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) for i in T.Parallel(block_M): sinks[i] = Sinks[by] - end = T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + end = T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) - start = T.max(0, (bx * block_M + past_len - window_size) // - block_N) if window_size is not None else 0 + start = T.max(0, (bx * block_M + past_len - window_size) // block_N) if window_size is not None else 0 for k in T.Pipelined( - start, - end, - num_stages=num_stages, - order=[-1, 0, 3, 1, -1, 2], - stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + start, + end, + num_stages=num_stages, + order=[-1, 0, 3, 1, -1, 2], + stage=[-1, 0, 0, 1, -1, 1], + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]], + ): + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + past_len + k_idx = k * block_N + j + if window_size is not None: + acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, -T.infinity(acc_s.dtype)) + else: + acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + # To do causal softmax, we need to set the scores_max to 0 if it is -inf + # This process is called Check_inf in FlashAttention3 code, and it only need to be done + # NOTE(wt): check_inf is necessary for sliding window attention. 
+ for i in T.Parallel(block_M): + if window_size is not None: + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + # max * log_2(e)) This allows the compiler to use the ffma + # instruction instead of fadd and fmul separately. + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i in T.Parallel(block_M): - logsum[i] += T.exp2(sinks[i] * 1.44269504 - - scores_max[i] * scale) # The only change for attention sink + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) # The only change for attention sink for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main # Following functions are adapted and optimized from # https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py -def ref_program(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - sinks: torch.Tensor, - sliding_window: Optional[int] = None, - dtype: torch.dtype = torch.float16) -> torch.Tensor: - - query = query.transpose(1, 2).contiguous().unsqueeze( - 3) # align with the original function'sinterface +def ref_program( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sinks: torch.Tensor, + sliding_window: Optional[int] = None, + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + query = query.transpose(1, 2).contiguous().unsqueeze(3) # align with the original function'sinterface key = key.transpose(1, 2).contiguous() value = value.transpose(1, 2).contiguous() @@ -236,41 +186,36 @@ def ref_program(query: torch.Tensor, output = torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float()) - output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, - head_dim).to(dtype) + output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, head_dim).to(dtype) return output.transpose(1, 2).contiguous() -def gen_inputs( - B, - H, - Sq, - Skv, - D, - dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - query = torch.randn([B, H, Sq, D], dtype=dtype, device='cuda') - key = torch.randn([B, H, Skv, D], dtype=dtype, device='cuda') - value = torch.randn([B, H, Skv, D], dtype=dtype, device='cuda') - sinks = torch.randn([H], dtype=dtype, device='cuda') +def gen_inputs(B, H, Sq, Skv, D, dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + query = torch.randn([B, H, Sq, D], dtype=dtype, device="cuda") + key = torch.randn([B, H, Skv, D], dtype=dtype, device="cuda") + value = torch.randn([B, H, Skv, D], dtype=dtype, device="cuda") + sinks = torch.randn([H], dtype=dtype, device="cuda") return query, key, value, sinks -def main(batch: int = 1, - heads: int = 32, - seq_q: int = 256, - seq_kv: int = 256, - dim: int = 128, - window_size: Optional[int] = None, - 
dtype: str = "float16", - tune: bool = False): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] +def main( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + window_size: Optional[int] = None, + dtype: T.dtype = T.float16, + tune: bool = False, +): + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= seq_q - flops_per_matmul = 2.0 * batch * heads * min( - window_size, seq_kv // 2) * seq_q * dim # just a rough estimation + flops_per_matmul = 2.0 * batch * heads * min(window_size, seq_kv // 2) * seq_q * dim # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim * 0.5 total_flops = 2 * flops_per_matmul @@ -297,15 +242,14 @@ def main(batch: int = 1, block_N=block_N, num_stages=num_stages, threads=threads, - dtype=dtype) + dtype=dtype, + ) Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, dtype=torch_dtype) torch.testing.assert_close( - kernel(Q, K, V, sinks), - ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), - rtol=1e-2, - atol=1e-2) + kernel(Q, K, V, sinks), ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), rtol=1e-2, atol=1e-2 + ) print("All checks passed.✅") latency = do_bench(lambda: kernel(Q, K, V, sinks), warmup=500) @@ -313,21 +257,38 @@ def main(batch: int = 1, print("Tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + window_size: Optional[int] = None, + dtype: str = "float16", + tune: bool = False, +): + torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + block_M = 128 + block_N = 128 + num_stages = 2 + threads = 256 + kernel = flashattn( + batch, heads, seq_q, seq_kv, dim, window_size, block_M=block_M, block_N=block_N, num_stages=num_stages, threads=threads, dtype=dtype + ) + Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, dtype=torch_dtype) + latency = do_bench(lambda: kernel(Q, K, V, sinks), backend="cupti") + return latency + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_q', type=int, default=4096, help='sequence length of query') - parser.add_argument('--seq_kv', type=int, default=4096, help='sequence length of key/value') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") - parser.add_argument('--tune', action='store_true', help='tune') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_q", type=int, default=4096, help="sequence length of query") + parser.add_argument("--seq_kv", type=int, default=4096, help="sequence length of key/value") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, 
which means full attention)") + parser.add_argument("--dtype", type=str, default=T.float16, help="dtype, can be float16 or bfloat16") + parser.add_argument("--tune", action="store_true", help="tune") args = parser.parse_args() - main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, - args.tune) + main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, args.tune) diff --git a/examples/attention_sink/regression_attention_sink.py b/examples/attention_sink/regression_attention_sink.py new file mode 100644 index 000000000..e2453173c --- /dev/null +++ b/examples/attention_sink/regression_attention_sink.py @@ -0,0 +1,64 @@ +import tilelang.testing +import example_mha_sink_fwd_bhsd +import example_mha_sink_fwd_bhsd_wgmma_pipelined +import example_mha_sink_bwd_bhsd +import example_gqa_sink_bwd_bhsd +import example_gqa_sink_fwd_bhsd_wgmma_pipelined + + +def regression_example_mha_sink_fwd_bhsd(): + tilelang.testing.process_func(example_mha_sink_fwd_bhsd.run_regression_perf) + + +def regression_example_mha_sink_fwd_bhsd_sliding_window(): + tilelang.testing.process_func( + example_mha_sink_fwd_bhsd.run_regression_perf, "regression_example_mha_sink_fwd_bhsd_sliding_window", window_size=128 + ) + + +def regression_example_mha_sink_fwd_bhsd_wgmma_pipelined(): + tilelang.testing.process_func(example_mha_sink_fwd_bhsd_wgmma_pipelined.run_regression_perf) + + +def regression_example_mha_sink_fwd_bhsd_wgmma_pipelined_sliding_window(): + tilelang.testing.process_func( + example_mha_sink_fwd_bhsd_wgmma_pipelined.run_regression_perf, + "regression_example_mha_sink_fwd_bhsd_wgmma_pipelined_sliding_window", + window_size=128, + ) + + +def regression_example_gqa_sink_fwd_bhsd_wgmma_pipelined(): + tilelang.testing.process_func(example_gqa_sink_fwd_bhsd_wgmma_pipelined.run_regression_perf) + + +def regression_example_gqa_sink_fwd_bhsd_wgmma_pipelined_sliding_window(): + tilelang.testing.process_func( + example_gqa_sink_fwd_bhsd_wgmma_pipelined.run_regression_perf, + "regression_example_gqa_sink_fwd_bhsd_wgmma_pipelined_sliding_window", + window_size=128, + ) + + +def regression_example_mha_sink_bwd_bhsd(): + tilelang.testing.process_func(example_mha_sink_bwd_bhsd.run_regression_perf) + + +def regression_example_mha_sink_bwd_bhsd_sliding_window(): + tilelang.testing.process_func( + example_mha_sink_bwd_bhsd.run_regression_perf, "regression_example_mha_sink_bwd_bhsd_sliding_window", window_size=128 + ) + + +def regression_example_gqa_sink_bwd_bhsd(): + tilelang.testing.process_func(example_gqa_sink_bwd_bhsd.run_regression_perf) + + +def regression_example_gqa_sink_bwd_bhsd_sliding_window(): + tilelang.testing.process_func( + example_gqa_sink_bwd_bhsd.run_regression_perf, "regression_example_gqa_sink_bwd_bhsd_sliding_window", window_size=128 + ) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/attention_sink/test_example_attention_sink.py b/examples/attention_sink/test_example_attention_sink.py index 57242c199..31a1ff1b3 100644 --- a/examples/attention_sink/test_example_attention_sink.py +++ b/examples/attention_sink/test_example_attention_sink.py @@ -5,6 +5,8 @@ import example_gqa_sink_fwd_bhsd_wgmma_pipelined import example_mha_sink_bwd_bhsd import example_gqa_sink_bwd_bhsd +import example_gqa_sink_fwd_varlen +import example_gqa_sink_bwd_varlen @tilelang.testing.requires_cuda @@ -61,5 +63,12 @@ def test_example_gqa_sink_bwd_bhsd_sliding_window(): example_gqa_sink_bwd_bhsd.main(window_size=128) 
+@tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version_ge(9, 0) +def test_example_gqa_sink_varlen(): + example_gqa_sink_fwd_varlen.main() # non-causal + example_gqa_sink_bwd_varlen.main() # causal + + if __name__ == "__main__": tilelang.testing.main() diff --git a/examples/autodd/README.md b/examples/autodd/README.md new file mode 100644 index 000000000..9ae9f9816 --- /dev/null +++ b/examples/autodd/README.md @@ -0,0 +1,126 @@ +# AutoDD - Automatic Delta Debugging for TileLang + +AutoDD (Automatic Delta Debugging) is a built-in debugging tool for TileLang that automatically simplifies complex Python programs to the minimal code needed to reproduce a specific error. This is extremely useful for debugging large, complex TileLang programs. + +## What is Delta Debugging? + +Delta Debugging is an automated debugging technique with the core idea: +1. Given a program that triggers a bug +2. Systematically remove code fragments from the program +3. Check if the simplified program still triggers the same bug +4. Eventually obtain the minimal code that triggers the bug + +AutoDD uses a Probability Distribution Driven Delta Debugging (PDD) algorithm for efficient search of minimized code. + +## Why AutoDD? + +When developing TileLang programs, bugs are often hidden in complex code: + +- **Lots of irrelevant code**: Real projects may have hundreds of lines of configuration, helper functions, logging, etc. +- **Hard to locate**: Error messages may point to underlying TVM/CUDA rather than TileLang code +- **Tedious debugging**: Manually deleting code to locate bugs is very time-consuming + +AutoDD automates this process, reducing hundreds of lines of code to just a few dozen, directly exposing the root cause of the problem. + +## Usage + +### Basic Usage + +```bash +python -m tilelang.autodd --err-msg "" -o +``` + +### Parameters + +| Parameter | Description | +|-----------|-------------| +| `source` | Path to the input Python source file | +| `--err-msg` | Error message to match (searched in stdout or stderr) | +| `-o, --output` | Path to the minimized output file | +| `--backend` | Execution backend: `runner` (faster) or `subproc` (more stable), default `runner` | +| `--timeout` | Timeout for each task in seconds, default 60 | +| `-j, --jobs` | Number of parallel jobs, default 1 | + +### Example + +Run AutoDD on `tilelang_buggy.py` in this directory: + +```bash +# Use 4 parallel jobs, search for "Dimension mismatch" error +python -m tilelang.autodd tilelang_buggy.py --err-msg "Dimension mismatch" -o minimized.py -j 4 + +# Or use subprocess backend (more stable but slower) +python -m tilelang.autodd tilelang_buggy.py --err-msg "Dimension mismatch" -o minimized.py --backend subproc +``` + +## Example Files + +### `tilelang_buggy.py` + +A complex TileLang program with a bug (~200 lines), containing: +- Multiple useless helper functions (`calculate_optimal_block_size`, `get_memory_requirements`, etc.) +- A complex configuration class (`MatmulConfig`) +- Unused benchmark code (`benchmark_pytorch`) +- **A GEMM shape mismatch bug** + +The bug is on line 124: +```python +B_shared = T.alloc_shared((block_M, block_N), dtype) # Wrong! Should be (block_K, block_N) +``` + +### `tilelang_minimized_expected.py` + +The expected output after AutoDD simplification (~30 lines). 
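+
+Conceptually, this file is what the reduction loop described under "What is Delta Debugging?" converges to: AutoDD repeatedly proposes smaller candidate programs and accepts a candidate only if running it still produces the target error message. Below is a minimal, line-based sketch of that outer loop (hypothetical helper names; the real tool rewrites the AST using the PDD strategy and parallel jobs rather than deleting raw text lines):
+
+```python
+import subprocess
+import sys
+
+
+def still_fails(candidate_source: str, err_msg: str) -> bool:
+    """Return True if running the candidate still prints the target error."""
+    try:
+        proc = subprocess.run(
+            [sys.executable, "-c", candidate_source],
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+    except subprocess.TimeoutExpired:
+        return False
+    return err_msg in proc.stdout or err_msg in proc.stderr
+
+
+def minimize_lines(lines: list[str], err_msg: str) -> list[str]:
+    """Greedily drop chunks of lines while the error keeps reproducing."""
+    chunk = max(1, len(lines) // 2)
+    while chunk >= 1:
+        i = 0
+        while i < len(lines):
+            candidate = lines[:i] + lines[i + chunk:]
+            if still_fails("\n".join(candidate), err_msg):
+                lines = candidate  # removal accepted: the error still reproduces
+            else:
+                i += chunk  # this chunk is needed to trigger the error
+        chunk //= 2
+    return lines
+```
+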
The simplified code clearly shows the root cause of the bug: + +```python +def buggy_matmul(...): + @T.prim_func + def matmul_kernel(): + with T.Kernel(): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_M, block_N), dtype) # Bug! + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.gemm(A_shared, B_shared, C_local) # Error occurs here +``` + +## How AutoDD Works + +AutoDD uses AST (Abstract Syntax Tree) analysis and multiple rewrite rules to simplify code: + +### 1. Fast Reducers +- **Statement removal**: Directly remove statements that don't affect bug reproduction +- **If statement simplification**: Simplify `if cond: body` to `body` +- **For loop simplification**: Bind loop variables to constants + +### 2. Canonicalizers +- **With statement expansion**: Convert `with expr as var` to explicit assignment +- **Function argument extension**: Add `*args, **kwargs` for compatibility + +### 3. Simplifiers +- **Assignment simplification**: Replace complex expressions with constants +- **Function call simplification**: Simplify `f(x)` to `x` +- **Binary operation simplification**: Simplify `a + b` to `a` or `b` + +### 4. Slow Reducers +- **Expression removal**: Remove arbitrary expressions +- **Argument removal**: Remove function arguments +- **Integer reduction**: Gradually reduce large integers + +## Use Cases + +1. **TileLang kernel debugging**: Simplify complex TileLang programs to locate bugs +2. **Bug report submission**: Generate minimal reproduction code for easier issue tracking +3. **Understanding errors**: Easier to understand the nature of errors after removing irrelevant code +4. **Regression testing**: Simplified code can serve as regression test cases + +## Notes + +1. **Error message matching**: The `--err-msg` parameter needs to exactly match a string in the error output +2. **Timeout setting**: For programs with long compilation times, you may need to increase `--timeout` +3. **Parallel jobs**: Increasing `-j` can speed up the simplification process but consumes more resources +4. **Backend selection**: If the `runner` backend is unstable, try the `subproc` backend + +## References + +- [Delta Debugging Paper](https://www.st.cs.uni-saarland.de/papers/tse2002/) +- [TileLang Documentation](https://github.com/tile-ai/tilelang) diff --git a/examples/autodd/tilelang_buggy.py b/examples/autodd/tilelang_buggy.py new file mode 100644 index 000000000..d2c5469bb --- /dev/null +++ b/examples/autodd/tilelang_buggy.py @@ -0,0 +1,229 @@ +""" +A complex TileLang program with lots of redundant code and a bug that triggers an error. +AutoDD will simplify it to the minimal code needed to reproduce the error. + +This example demonstrates how AutoDD can help developers quickly isolate bugs +in complex TileLang programs by automatically removing irrelevant code. + +To run AutoDD on this file: + python -m tilelang.autodd tilelang_buggy.py --err-msg "Dimension mismatch" -o minimized.py -j 4 + +The bug in this file: B_shared has shape (block_M, block_N) instead of (block_K, block_N), +causing a GEMM dimension mismatch error. 
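+
+AutoDD should reduce this file to roughly the kernel shown in
+tilelang_minimized_expected.py: the two T.alloc_shared calls, the C_local
+fragment and the failing T.gemm(A_shared, B_shared, C_local), with the
+helper functions and the MatmulConfig class stripped away.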
+""" + +import tilelang +import tilelang.language as T +import torch + + +# Useless helper function - will be removed by AutoDD +def calculate_optimal_block_size(M, N, K): + """Calculate optimal block size - this function is completely useless""" + options = [32, 64, 128, 256] + best = 128 + for opt in options: + if M % opt == 0 and N % opt == 0: + best = opt + break + return best, best, 32 + + +def get_memory_requirements(M, N, K, block_M, block_N, block_K, dtype_size=2): + """Calculate memory requirements - completely useless""" + shared_mem_a = block_M * block_K * dtype_size + shared_mem_b = block_K * block_N * dtype_size + total_shared = shared_mem_a + shared_mem_b + return total_shared + + +def validate_parameters(M, N, K, block_M, block_N, block_K): + """Validate parameters - redundant check""" + if M <= 0 or N <= 0 or K <= 0: + raise ValueError("Matrix dimensions must be positive") + if block_M <= 0 or block_N <= 0 or block_K <= 0: + raise ValueError("Block sizes must be positive") + if M % block_M != 0: + print(f"Warning: M ({M}) not divisible by block_M ({block_M})") + if N % block_N != 0: + print(f"Warning: N ({N}) not divisible by block_N ({block_N})") + if K % block_K != 0: + print(f"Warning: K ({K}) not divisible by block_K ({block_K})") + return True + + +class MatmulConfig: + """Configuration class - increases code complexity but is actually useless""" + + def __init__(self, M, N, K): + self.M = M + self.N = N + self.K = K + self.block_M = 128 + self.block_N = 128 + self.block_K = 32 + self.num_stages = 3 + self.threads = 128 + self.dtype = "float16" + self.accum_dtype = "float32" + + def get_grid_size(self): + grid_x = (self.N + self.block_N - 1) // self.block_N + grid_y = (self.M + self.block_M - 1) // self.block_M + return grid_x, grid_y + + def get_shared_memory_size(self): + return get_memory_requirements(self.M, self.N, self.K, self.block_M, self.block_N, self.block_K) + + def validate(self): + return validate_parameters(self.M, self.N, self.K, self.block_M, self.block_N, self.block_K) + + +def create_reference_output(a, b, activation="relu"): + """Create reference output - not actually used in verification""" + result = a @ b + if activation == "relu": + result = torch.relu(result) + elif activation == "gelu": + result = torch.nn.functional.gelu(result) + elif activation == "sigmoid": + result = torch.sigmoid(result) + return result + + +def benchmark_pytorch(M, N, K, num_iters=10, warmup=5): + """PyTorch benchmark - not used""" + a = torch.randn(M, K, device="cuda", dtype=torch.float16) + b = torch.randn(K, N, device="cuda", dtype=torch.float16) + + # Warmup + for _ in range(warmup): + _ = a @ b + torch.cuda.synchronize() + + # Benchmark + import time + + start = time.time() + for _ in range(num_iters): + _ = a @ b + torch.cuda.synchronize() + end = time.time() + + return (end - start) / num_iters * 1000 # ms + + +# Main TileLang kernel - contains a BUG: GEMM shape mismatch! +@tilelang.jit +def buggy_matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): + @T.prim_func + def matmul_kernel( + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): + # Allocate shared memory + A_shared = T.alloc_shared((block_M, block_K), dtype) + # BUG: the first dimension of B_shared should be block_K, but block_M is used here! + B_shared = T.alloc_shared((block_M, block_N), dtype) # Wrong shape! 
+ C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + + # Allocate some useless temp variables + temp_buffer = T.alloc_fragment((block_M, block_N), accum_dtype) + + # Zero out + T.clear(C_local) + T.clear(temp_buffer) + + # Main loop + for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3): + # Copy a tile of A + T.copy(A[by * block_M, ko * block_K], A_shared) + + # Copy a tile of B - shape can mismatch here too + T.copy(B[ko * block_K, bx * block_N], B_shared) + + # GEMM computation - shape mismatch will cause an error + # A_shared: (block_M, block_K) + # B_shared: (block_M, block_N) <- should be (block_K, block_N) + T.gemm(A_shared, B_shared, C_local) + + # ReLU activation + for i, j in T.Parallel(block_M, block_N): + C_local[i, j] = T.max(C_local[i, j], 0) + + # Some useless postprocessing + for i, j in T.Parallel(block_M, block_N): + if temp_buffer[i, j] > 0: + C_local[i, j] = C_local[i, j] + 0.0 + + # Write back result + T.copy(C_local, C[by * block_M, bx * block_N]) + + return matmul_kernel + + +def run_kernel(config): + """Run kernel - includes extra redundant logic""" + # Validate parameters + config.validate() + + # Get config + M, N, K = config.M, config.N, config.K + block_M, block_N, block_K = config.block_M, config.block_N, config.block_K + + # Calculate some useless statistics + grid_size = config.get_grid_size() + shared_mem = config.get_shared_memory_size() + print(f"Grid size: {grid_size}") + print(f"Shared memory: {shared_mem} bytes") + + # Create test data + a = torch.randn(M, K, device="cuda", dtype=torch.float16) + b = torch.randn(K, N, device="cuda", dtype=torch.float16) + c = torch.empty(M, N, device="cuda", dtype=torch.float16) + + # Compile and run kernel - will trigger the BUG here + kernel = buggy_matmul(M, N, K, block_M, block_N, block_K) + kernel(a, b, c) + + # Validate results (if it can get here) + ref_c = torch.relu(a @ b) + torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) + print("Kernel output matches PyTorch reference.") + + return c + + +def main(): + # Useless printing + print("=" * 60) + print("TileLang Matmul Kernel Test") + print("=" * 60) + + # Create config + M, N, K = 512, 512, 512 + config = MatmulConfig(M, N, K) + + # Calculate some useless values + optimal_block = calculate_optimal_block_size(M, N, K) + print(f"Optimal block size: {optimal_block}") + + # Run PyTorch benchmark - result is not used + # pytorch_time = benchmark_pytorch(M, N, K) + # print(f"PyTorch time: {pytorch_time:.3f} ms") + + # Run our kernel - will trigger the error here + try: + result = run_kernel(config) + print(f"Result shape: {result.shape}") + except Exception as e: + print(f"Error: {e}") + raise + + print("Done!") + + +if __name__ == "__main__": + main() diff --git a/examples/autodd/tilelang_minimized_expected.py b/examples/autodd/tilelang_minimized_expected.py new file mode 100644 index 000000000..3dc88f992 --- /dev/null +++ b/examples/autodd/tilelang_minimized_expected.py @@ -0,0 +1,49 @@ +""" +This is the expected output after running AutoDD on tilelang_buggy.py. +AutoDD automatically simplified the 200+ line buggy program to ~30 lines +while preserving the ability to reproduce the error. 
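+
+A file like this can be regenerated with:
+    python -m tilelang.autodd tilelang_buggy.py --err-msg "Dimension mismatch" -o minimized.py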
+ +The minimized code clearly shows the root cause of the bug: +- A_shared has shape (block_M, block_K) +- B_shared has shape (block_M, block_N) - should be (block_K, block_N) +- This causes a dimension mismatch in T.gemm() +""" + +import tilelang.language as T + + +class MatmulConfig: + def __init__(self, *args, **kwargs): + self.M = 1 + self.N = 1 + self.K = 1 + self.block_M = 2 + self.block_N = 1 + self.block_K = 1 + + +def buggy_matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32, *args, **kwargs): + @T.prim_func + def matmul_kernel(): + with T.Kernel(): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_M, block_N), dtype) # Bug: should be (block_K, block_N) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.gemm(A_shared, B_shared, C_local) + + +def run_kernel(config, *args, **kwargs): + M, N, K = (config.M, config.N, config.K) + block_M, block_N, block_K = (config.block_M, config.block_N, config.block_K) + buggy_matmul(M, N, K, block_M, block_N, block_K) + + +def main(*args, **kwargs): + config = MatmulConfig() + try: + run_kernel(config) + except Exception as e: + print(f"{e}") + + +main() diff --git a/examples/bitnet-1.58b/.gitignore b/examples/bitnet-1.58b/.gitignore index 6ea887496..2bcdfd92b 100644 --- a/examples/bitnet-1.58b/.gitignore +++ b/examples/bitnet-1.58b/.gitignore @@ -1 +1 @@ -models/ \ No newline at end of file +models/ diff --git a/examples/bitnet-1.58b/README.md b/examples/bitnet-1.58b/README.md index 2b587eab4..b9898741b 100644 --- a/examples/bitnet-1.58b/README.md +++ b/examples/bitnet-1.58b/README.md @@ -2,7 +2,6 @@ license: mit --- - This is a Tilelang Implementation for the reproduced 1.58bit model from [1bitLLM/bitnet_b1_58-3B](https://huggingface.co/1bitLLM/bitnet_b1_58-3B). We replaced the original simulated Int8x3bit Quantized Inference Kernel with INT8xINT2 Kernel. We also evaluated the model's correctness and performance through `eval_correctness.py` and `benchmark_inference_latency.py`. 
## Make Checkpoints for vLLM @@ -43,7 +42,6 @@ python3 inference_with_bitblas_format.py | bitnet-3b-1.58bits | vllm-tilelang | 379.25 | 117.43 | 752.55 | | bitnet-3b-1.58bits | vllm-tilelang-cuda-graph | 2543.58 | 1621.08 | 2731.79 | - ## BitBLAS Results ### Performance @@ -94,4 +92,4 @@ The differences between the reported numbers and the reproduced results are poss journal={arXiv preprint arXiv:2402.17764}, year={2024} } -``` \ No newline at end of file +``` diff --git a/examples/bitnet-1.58b/benchmark.sh b/examples/bitnet-1.58b/benchmark.sh index 6a2550d45..839443dc6 100755 --- a/examples/bitnet-1.58b/benchmark.sh +++ b/examples/bitnet-1.58b/benchmark.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + python benchmark_generate.py --bs 16 --in_seq_len 32 --out_seq_len 128 | tee b16_i32_o128.log python benchmark_generate.py --bs 1 --in_seq_len 512 --out_seq_len 64 | tee b1_i512_o64.log diff --git a/examples/bitnet-1.58b/benchmark_generate.py b/examples/bitnet-1.58b/benchmark_generate.py index d6f21ed50..d678b91a4 100644 --- a/examples/bitnet-1.58b/benchmark_generate.py +++ b/examples/bitnet-1.58b/benchmark_generate.py @@ -12,8 +12,7 @@ def generate_text_batch(model, tokenizer, prompts, max_length=100): # Encode the input prompts as a batch - input_ids = tokenizer( - prompts, return_tensors="pt", padding=True, truncation=True).input_ids.to(model.device) + input_ids = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).input_ids.to(model.device) # Generate cos and sin values (commented out as not used in generation) seq_length = input_ids.size(1) @@ -37,9 +36,7 @@ def generate_text_batch(model, tokenizer, prompts, max_length=100): end_time = time.time() # Decode the output ids to text - generated_texts = [ - tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output_ids - ] + generated_texts = [tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output_ids] generation_time = end_time - start_time num_tokens = sum(len(output_id) for output_id in output_ids) @@ -52,8 +49,8 @@ def generate_text_batch(model, tokenizer, prompts, max_length=100): def profile(model, input_data): - import numpy as np + model = model.cuda() model.eval() @@ -74,25 +71,29 @@ def get_runtime(num_repeats=1): return np.mean(times) -model_path = '1bitLLM/bitnet_b1_58-3B' +model_path = "1bitLLM/bitnet_b1_58-3B" def main(): parser = argparse.ArgumentParser() - parser.add_argument('--bs', default=16, type=int) - parser.add_argument('--in_seq_len', default=32, type=int) - parser.add_argument('--out_seq_len', default=128, type=int) - parser.add_argument('--bitblas', action='store_true') + parser.add_argument("--bs", default=16, type=int) + parser.add_argument("--in_seq_len", default=32, type=int) + parser.add_argument("--out_seq_len", default=128, type=int) + parser.add_argument("--bitblas", action="store_true") args = parser.parse_args() bs = args.bs in_seq_len = args.in_seq_len out_seq_len = args.out_seq_len is_bitblas = args.bitblas - model = BitnetForCausalLM.from_pretrained( - model_path, - use_flash_attention_2=True, - torch_dtype=torch.float16, - ).cuda().half() + model = ( + BitnetForCausalLM.from_pretrained( + model_path, + use_flash_attention_2=True, + torch_dtype=torch.float16, + ) + .cuda() + .half() + ) if is_bitblas: with torch.no_grad(): model.quantize() @@ -109,5 +110,5 @@ def main(): print(generate_text_batch(model, tokenizer, prompts, max_length=max_length)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git 
a/examples/bitnet-1.58b/benchmark_inference_latency.py b/examples/bitnet-1.58b/benchmark_inference_latency.py index 9ce7a3898..788fc5565 100644 --- a/examples/bitnet-1.58b/benchmark_inference_latency.py +++ b/examples/bitnet-1.58b/benchmark_inference_latency.py @@ -6,13 +6,14 @@ torch.set_grad_enabled(False) parser = argparse.ArgumentParser() -parser.add_argument('--hf_path', default='1bitLLM/bitnet_b1_58-3B', type=str) +parser.add_argument("--hf_path", default="1bitLLM/bitnet_b1_58-3B", type=str) def profile(model, input_data): import time import numpy as np + model = model.cuda() model.eval() @@ -35,8 +36,8 @@ def get_runtime(num_repeats=1): def main(): model = BitnetForCausalLM.from_pretrained( - '1bitLLM/bitnet_b1_58-3B', - device_map='auto', + "1bitLLM/bitnet_b1_58-3B", + device_map="auto", low_cpu_mem_usage=True, use_flash_attention_2=True, torch_dtype=torch.float16, @@ -52,5 +53,5 @@ def main(): print(f"Batch size: {batch_size}, Seq len: {seq_len}, Latency: {latency}") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/bitnet-1.58b/configuration_bitnet.py b/examples/bitnet-1.58b/configuration_bitnet.py index 5f4937b87..63c499db3 100644 --- a/examples/bitnet-1.58b/configuration_bitnet.py +++ b/examples/bitnet-1.58b/configuration_bitnet.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" LLaMA model configuration""" +"""LLaMA model configuration""" from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -180,16 +180,10 @@ def _rope_scaling_validation(self): return if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " - f"got {self.rope_scaling}") + raise ValueError(f"`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, got {self.rope_scaling}") rope_scaling_type = self.rope_scaling.get("type", None) rope_scaling_factor = self.rope_scaling.get("factor", None) if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, - float) or rope_scaling_factor <= 1.0: - raise ValueError( - f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") + raise ValueError(f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}") + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") diff --git a/examples/bitnet-1.58b/eval_correctness.py b/examples/bitnet-1.58b/eval_correctness.py index ac1e34072..11d47004b 100644 --- a/examples/bitnet-1.58b/eval_correctness.py +++ b/examples/bitnet-1.58b/eval_correctness.py @@ -47,8 +47,8 @@ def generate_text(model, tokenizer, prompt, max_length=100): def profile(model, input_data): - import numpy as np + model = model.cuda() model.eval() @@ -69,18 +69,22 @@ def get_runtime(num_repeats=1): return np.mean(times) -model_path = '1bitLLM/bitnet_b1_58-3B' +model_path = "1bitLLM/bitnet_b1_58-3B" def main(): - model = BitnetForCausalLM.from_pretrained( - model_path, - 
use_flash_attention_2=False, - torch_dtype=torch.float16, - ).cuda().half() + model = ( + BitnetForCausalLM.from_pretrained( + model_path, + use_flash_attention_2=False, + torch_dtype=torch.float16, + ) + .cuda() + .half() + ) tokenizer = BitnetTokenizer.from_pretrained(model_path, use_fast=False) - input_id = tokenizer("Hello")['input_ids'] + input_id = tokenizer("Hello")["input_ids"] input_id = torch.tensor(input_id).unsqueeze(0).cuda() print("original model generated text:") @@ -91,5 +95,5 @@ def main(): print(generate_text(model, tokenizer, "Hello", max_length=100)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/bitnet-1.58b/eval_gpu_memory.py b/examples/bitnet-1.58b/eval_gpu_memory.py index 597cbbfcd..00c914cb3 100644 --- a/examples/bitnet-1.58b/eval_gpu_memory.py +++ b/examples/bitnet-1.58b/eval_gpu_memory.py @@ -6,13 +6,14 @@ torch.set_grad_enabled(False) parser = argparse.ArgumentParser() -parser.add_argument('--hf_path', default='1bitLLM/bitnet_b1_58-3B', type=str) +parser.add_argument("--hf_path", default="1bitLLM/bitnet_b1_58-3B", type=str) def profile(model, input_data): import time import numpy as np + model = model.cuda() model.eval() @@ -35,17 +36,17 @@ def get_runtime(num_repeats=1): def main(): model = BitnetForCausalLM.from_pretrained( - '1bitLLM/bitnet_b1_58-3B', - device_map='auto', + "1bitLLM/bitnet_b1_58-3B", + device_map="auto", low_cpu_mem_usage=True, use_flash_attention_2=True, torch_dtype=torch.float16, ).half() - print(f"gpu memory: {torch.cuda.memory_allocated() / 1024 ** 3} GB") + print(f"gpu memory: {torch.cuda.memory_allocated() / 1024**3} GB") with torch.no_grad(): model._post_process_weights() - print(f"gpu memory BitBLAS: {torch.cuda.memory_allocated() / 1024 ** 3} GB") + print(f"gpu memory BitBLAS: {torch.cuda.memory_allocated() / 1024**3} GB") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/bitnet-1.58b/eval_ppl.py b/examples/bitnet-1.58b/eval_ppl.py index 61c8488e4..97db2d0f5 100644 --- a/examples/bitnet-1.58b/eval_ppl.py +++ b/examples/bitnet-1.58b/eval_ppl.py @@ -15,9 +15,9 @@ torch.set_grad_enabled(False) parser = argparse.ArgumentParser() -parser.add_argument('--seed', default=0, type=int) -parser.add_argument('--hf_path', default='1bitLLM/bitnet_b1_58-3B', type=str) -parser.add_argument('--seqlen', default=2048, type=int) +parser.add_argument("--seed", default=0, type=int) +parser.add_argument("--hf_path", default="1bitLLM/bitnet_b1_58-3B", type=str) +parser.add_argument("--seqlen", default=2048, type=int) def calulate_loss(model, input, loss_fct): @@ -29,12 +29,16 @@ def calulate_loss(model, input, loss_fct): def main(args): - datasets = ['c4', 'wikitext2'] - model = BitnetForCausalLM.from_pretrained( - args.hf_path, - use_flash_attention_2=True, - torch_dtype=torch.float16, - ).cuda().half() + datasets = ["c4", "wikitext2"] + model = ( + BitnetForCausalLM.from_pretrained( + args.hf_path, + use_flash_attention_2=True, + torch_dtype=torch.float16, + ) + .cuda() + .half() + ) with torch.no_grad(): model._post_process_weights() tokenizer = BitnetTokenizer.from_pretrained(args.hf_path, use_fast=False) @@ -48,9 +52,9 @@ def main(args): for ii in progress: input = torch.Tensor(testdata[ii]).long().cuda().view(1, -1) loss = calulate_loss(model, input, loss_fct) - count += (input.size(-1) - 1) + count += input.size(-1) - 1 acc_loss += loss.item() - progress.set_description(f"avg_loss = {acc_loss/ count / math.log(2)}") + progress.set_description(f"avg_loss = {acc_loss / count / 
math.log(2)}") avg_loss = acc_loss / count / math.log(2) ppl.append(2**avg_loss) @@ -60,7 +64,7 @@ def main(args): print("Avg PPL:", sum(ppl) / len(ppl)) -if __name__ == '__main__': +if __name__ == "__main__": torch.set_grad_enabled(False) args = parser.parse_args() random.seed(args.seed) diff --git a/examples/bitnet-1.58b/eval_utils.py b/examples/bitnet-1.58b/eval_utils.py index 46241eedf..72480c392 100644 --- a/examples/bitnet-1.58b/eval_utils.py +++ b/examples/bitnet-1.58b/eval_utils.py @@ -15,21 +15,17 @@ def set_seed(seed): def get_test_dataset(dataset_name, tokenizer, seqlen=2048): if dataset_name == "wikitext2": - testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') - testdata = "".join(testdata['text']).split('\n') + testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + testdata = "".join(testdata["text"]).split("\n") elif dataset_name == "c4": - testdata = load_dataset( - 'allenai/c4', - data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, - split='validation')['text'] + testdata = load_dataset("allenai/c4", data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, split="validation")[ + "text" + ] else: raise NotImplementedError testdata = [item for item in testdata if item != ""] - tokenized_text = [ - tokenizer(item, add_special_tokens=False)['input_ids'] + [tokenizer.eos_token_id] - for item in testdata - ] + tokenized_text = [tokenizer(item, add_special_tokens=False)["input_ids"] + [tokenizer.eos_token_id] for item in testdata] data, doc = [], [tokenizer.bos_token_id] for sen in tokenized_text: @@ -45,7 +41,6 @@ def get_test_dataset(dataset_name, tokenizer, seqlen=2048): class LMEvalAdaptor(BaseLM): - def __init__(self, model_name, model, tokenizer, batch_size=1, max_length=-1): super().__init__() @@ -137,5 +132,4 @@ def _model_call(self, inps): return out def _model_generate(self, context, max_length, eos_token_id): - return self.model.generate( - context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False) + return self.model.generate(context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False) diff --git a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py index e5af16cc4..a31261d3e 100644 --- a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py +++ b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py @@ -76,13 +76,13 @@ def bitnet_158_int8xint2_decode( reduce_thread=32, ): assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" storage_nbit = 8 num_bits = 2 @@ -94,7 +94,7 @@ def bitnet_158_int8xint2_decode( MAX_TRANSACTION_SIZE_IN_BITS = 128 micro_size_k = MAX_TRANSACTION_SIZE_IN_BITS // DataType(in_dtype).bits micro_size_k_compressed = micro_size_k // num_elems_per_byte - storage_dtype = "int8" + storage_dtype = T.int8 block_K = reduce_thread * micro_size_k use_dp4a = True @@ -102,17 +102,17 @@ def bitnet_158_int8xint2_decode( @T.prim_func def kernel( - A: T.Buffer(A_shape, in_dtype), - B: T.Buffer(B_shape, storage_dtype), - C: T.Buffer(C_shape, out_dtype), + A: T.Buffer(A_shape, in_dtype), + B: T.Buffer(B_shape, storage_dtype), + C: T.Buffer(C_shape, out_dtype), ): with T.Kernel( - T.ceildiv(N, 
n_partition), - M, - threads=(reduce_thread, n_partition), + T.ceildiv(N, n_partition), + M, + threads=(reduce_thread, n_partition), ) as ( - bx, - by, + bx, + by, ): A_local = T.alloc_local((micro_size_k,), in_dtype) B_quant_local = T.alloc_local([micro_size_k_compressed], storage_dtype) @@ -133,8 +133,7 @@ def kernel( for v in T.vectorized(micro_size_k_compressed): B_quant_local[v] = B[ bx * n_partition + ni, - ko * (reduce_thread * micro_size_k_compressed) + - kr * micro_size_k_compressed + v, + ko * (reduce_thread * micro_size_k_compressed) + kr * micro_size_k_compressed + v, ] T.call_extern( @@ -156,9 +155,9 @@ def kernel( accum_res[0] += A_local[ki] * B_dequantize_local[ki] with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), - "reduce_scope", - T.reinterpret(T.uint64(0), dtype="handle"), + T.comm_reducer(lambda x, y: x + y, [T.cast(0, accum_dtype)]), + "reduce_scope", + T.reinterpret(T.uint64(0), dtype="handle"), ): T.evaluate( T.tvm_thread_allreduce( @@ -168,7 +167,8 @@ def kernel( reduced_accum_res[0], kr, dtype="handle", - )) + ) + ) if kr == 0: C[by, bx * n_partition + ni] = reduced_accum_res[0] @@ -194,12 +194,12 @@ def general_compress(lowprecision_weight, source_bits=4, storage_dtype=np.int8): # interleave weight numpy implementation -def interleave_weight(qweight, nbits=4, target_dtype="float16"): - assert target_dtype in ["float16", "int8"] +def interleave_weight(qweight, nbits=4, target_dtype=T.float16): + assert target_dtype in [T.float16, T.int8] # reinterpret the data type of qweight to int32 qweight = qweight.view(np.int32) new_qweight = np.zeros_like(qweight) - bits_stride = 8 if target_dtype == "int8" else 16 + bits_stride = 8 if target_dtype == T.int8 else 16 mask = (1 << nbits) - 1 # for 4bit the val is 0x0000000f num_groups = 32 // bits_stride elems_per_group = bits_stride // nbits @@ -209,7 +209,7 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): shift = (offset % num_groups) * bits_stride + (offset // num_groups) * nbits new_qweight |= ((qweight >> (nbits * offset)) & mask) << shift - if nbits == 1 and target_dtype == "int8": + if nbits == 1 and target_dtype == T.int8: # special handling for 1b interleave n16_weight = new_qweight & np.int32(0xF0F00F0F) n16_weight |= ((new_qweight & np.int32(0x000000F0)) >> 4) << 16 @@ -217,12 +217,12 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): n16_weight |= ((new_qweight & np.int32(0x000F0000)) >> 16) << 4 n16_weight |= ((new_qweight & np.int32(0x0F000000)) >> 24) << 12 return n16_weight.view(np.int8) - elif nbits == 2 and target_dtype == "float16": + elif nbits == 2 and target_dtype == T.float16: n8_weight = new_qweight & np.int32(0xFF0000FF) n8_weight |= ((new_qweight & np.int32(0x0000FF00)) >> 8) << 16 n8_weight |= ((new_qweight & np.int32(0x00FF0000)) >> 16) << 8 return n8_weight.view(np.int8) - elif nbits == 1 and target_dtype == "float16": + elif nbits == 1 and target_dtype == T.float16: n8_weight = new_qweight & 0xF000000F n8_weight |= ((new_qweight & 0x000000F0) >> 4) << 8 n8_weight |= ((new_qweight & 0x00000F00) >> 8) << 16 @@ -234,13 +234,7 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): return new_qweight.view(np.int8) -def assert_bitnet_158_int8xint2_decode_correctness(M, - N, - K, - in_dtype, - out_dtype, - accum_dtype, - fast_decoding=True): +def assert_bitnet_158_int8xint2_decode_correctness(M, N, K, in_dtype, out_dtype, accum_dtype, fast_decoding=True): program = bitnet_158_int8xint2_decode(M, N, K, in_dtype, out_dtype, 
accum_dtype, fast_decoding) print(program) kernel = tilelang.compile(program) @@ -265,4 +259,4 @@ def assert_bitnet_158_int8xint2_decode_correctness(M, if __name__ == "__main__": - assert_bitnet_158_int8xint2_decode_correctness(1, 256, 256, "int8", "int32", "int32") + assert_bitnet_158_int8xint2_decode_correctness(1, 256, 256, T.int8, T.int32, T.int32) diff --git a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py index d8b1f6228..f4a60098a 100644 --- a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py +++ b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py @@ -8,11 +8,13 @@ from tilelang import tvm as tvm from tvm import DataType from tilelang.intrinsics.mma_layout import ( - make_mma_swizzle_layout as make_swizzle_layout,) + make_mma_swizzle_layout as make_swizzle_layout, +) import numpy as np from tilelang.intrinsics.mma_macro_generator import ( - INT4TensorCoreIntrinEmitter,) + INT4TensorCoreIntrinEmitter, +) from tilelang.transform import simplify_prim_func torch.manual_seed(42) @@ -86,9 +88,9 @@ def bitnet_158_int8xint2_prefill( Create a TVM GPU prim_func implementing a block-tiled matrix multiply that multiplies dense A by compressed/interleaved low‑precision B (2-bit packed into int8 storage), decoding B to int8 on-chip and accumulating into C. The returned prim_func expects: - - A: shape (M, K) with dtype `in_dtype` ("float16" or "int8"). + - A: shape (M, K) with dtype `in_dtype` (T.float16 or T.int8). - B: compressed storage with shape (N, K/4) and int8 storage layout (packing 4 2-bit elements per byte). - - C: output buffer shape (M, N) with dtype `out_dtype` ("float16", "float32", or "int32"). + - C: output buffer shape (M, N) with dtype `out_dtype` (T.float16, T.float32, or T.int32). Details: - Builds a tiled, pipelined kernel using shared memory and warp-level MMA intrinsics (INT4TensorCoreIntrinEmitter). B is loaded from compressed storage, decoded to int8 in threads (via decode_i2u_to_i8s / decode_i2s_to_i8s), and dequantized into a shared buffer used by the MMA emitter. @@ -96,15 +98,15 @@ def bitnet_158_int8xint2_prefill( - block_row_warps, block_col_warps: number of warps per block in row/col. - warp_row_tiles, warp_col_tiles: tiles per warp. - chunk: K-sized chunk per block (block_K). - - micro sizes are fixed (16x16x16, except micro_k=32 when accum_dtype == "int32"). + - micro sizes are fixed (16x16x16, except micro_k=32 when accum_dtype == T.int32). - Uses 2-stage pipelining by default to overlap loads and compute and applies a swizzle layout to improve L2 behavior. - Assertions: raises AssertionError if in_dtype or out_dtype are not among supported values. Parameters: M, N, K (int): Global matrix dimensions. - in_dtype (str): Input and decoded B element dtype; "float16" or "int8". - out_dtype (str): Output C dtype; one of "float16", "float32", "int32". - accum_dtype (str): Accumulator dtype used by MMA (e.g., "int32"). + in_dtype (str): Input and decoded B element dtype; T.float16 or T.int8. + out_dtype (str): Output C dtype; one of T.float16, T.float32, T.int32. + accum_dtype (str): Accumulator dtype used by MMA (e.g., T.int32). fast_decoding (bool): If True, enable the fast decoding path (affects which device decode is used). block_row_warps (int): Warps in block row dimension. block_col_warps (int): Warps in block column dimension. 
@@ -116,18 +118,18 @@ def bitnet_158_int8xint2_prefill( T.prim_func: A TVM prim_func implementing the described GPU kernel suitable for compilation and execution. """ assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - if accum_dtype == "int32": + if accum_dtype == T.int32: micro_size_k = 32 num_elems_per_byte = 4 @@ -136,7 +138,7 @@ def bitnet_158_int8xint2_prefill( local_size_compressed = local_size // num_elems_per_byte shared_scope = "shared.dyn" - storage_dtype = "int8" + storage_dtype = T.int8 # Pipeline Stage stage = 2 @@ -181,38 +183,36 @@ def bitnet_158_int8xint2_prefill( @T.prim_func def main( - A: T.Buffer(A_shape, in_dtype), - B: T.Buffer(B_shape, storage_dtype), - C: T.Buffer((M, N), out_dtype), + A: T.Buffer(A_shape, in_dtype), + B: T.Buffer(B_shape, storage_dtype), + C: T.Buffer((M, N), out_dtype), ): """ - GPU kernel entry that performs a blocked, pipelined matrix multiplication A @ B.T writing into C. + GPU kernel entry that performs a blocked, pipelined matrix multiplication A @ B.T writing into C. - This kernel: - - Loads tiles of A and a compressed/interleaved representation of B from global memory into shared memory. - - Decodes B's packed low-precision format (storage_dtype, e.g., 2-bit packed) into element values of `in_dtype` in shared memory via an external decode routine. - - Uses Warp/MMA tiled fragments and an INT4/INT2-capable MMA emitter to compute accumulation across K in a pipelined fashion with configurable stages. - - Writes accumulated tile results from shared memory back to global C with the expected block/micro-tile indexing. + This kernel: + - Loads tiles of A and a compressed/interleaved representation of B from global memory into shared memory. + - Decodes B's packed low-precision format (storage_dtype, e.g., 2-bit packed) into element values of `in_dtype` in shared memory via an external decode routine. + - Uses Warp/MMA tiled fragments and an INT4/INT2-capable MMA emitter to compute accumulation across K in a pipelined fashion with configurable stages. + - Writes accumulated tile results from shared memory back to global C with the expected block/micro-tile indexing. - Parameters: - A: Input matrix buffer of shape A_shape and element type `in_dtype`. Represents the MxK activations. - B: Compressed/interleaved weight buffer of shape B_shape and storage type `storage_dtype`. Must contain B in the packed low-precision layout expected by the decode routine used by this kernel. - C: Output buffer of shape (M, N) and type `out_dtype`; receives the resulting matrix (accumulated values are produced in `accum_dtype` and stored into C). + Parameters: + A: Input matrix buffer of shape A_shape and element type `in_dtype`. Represents the MxK activations. + B: Compressed/interleaved weight buffer of shape B_shape and storage type `storage_dtype`. Must contain B in the packed low-precision layout expected by the decode routine used by this kernel. + C: Output buffer of shape (M, N) and type `out_dtype`; receives the resulting matrix (accumulated values are produced in `accum_dtype` and stored into C). - Side effects: - Writes results into C. Calls external device decode functions to expand B from its packed representation into shared memory before computation. 
+ Side effects: + Writes results into C. Calls external device decode functions to expand B from its packed representation into shared memory before computation. """ with T.Kernel( - T.ceildiv(N, block_N), - T.ceildiv(M, block_M), - threads=threads, - prelude=decode_i2s_to_i8s, + T.ceildiv(N, block_N), + T.ceildiv(M, block_M), + threads=threads, + prelude=decode_i2s_to_i8s, ) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, storage_dtype, scope=shared_scope) - B_dequantize_shared = T.alloc_shared( - B_dequantize_shared_shape, in_dtype, scope=shared_scope) + B_dequantize_shared = T.alloc_shared(B_dequantize_shared_shape, in_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) A_frag = T.alloc_local((warp_rows * fragement_size_a), in_dtype) B_frag = T.alloc_local((warp_cols * fragement_size_b), in_dtype) @@ -221,12 +221,14 @@ def main( B_local = T.alloc_local([local_size_compressed], storage_dtype) B_dequantize_local = T.alloc_local([local_size], in_dtype) - thread_bindings = T.thread_binding(0, threads, "threadIdx.x") + thread_bindings = T.get_thread_binding(0) - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - B_dequantize_shared: make_swizzle_layout(B_dequantize_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + B_dequantize_shared: make_swizzle_layout(B_dequantize_shared), + } + ) # Improve L2 Cache T.use_swizzle(panel_size=10) @@ -234,7 +236,6 @@ def main( T.clear(C_frag) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, block_K): A_shared[i, k] = A[by * block_M + i, ko * block_K + k] @@ -243,12 +244,9 @@ def main( for j, k in T.Parallel(block_N, block_K // num_elems_per_byte): B_shared[j, k] = B[bx * block_N + j, ko * (block_K // num_elems_per_byte) + k] - for i in T.serial(block_N * block_K // num_elems_per_byte // - (threads * local_size_compressed)): + for i in T.serial(block_N * block_K // num_elems_per_byte // (threads * local_size_compressed)): for v in T.vectorized(0, local_size_compressed): - index = ( - i * threads * local_size_compressed + - thread_bindings * local_size_compressed + v) + index = i * threads * local_size_compressed + thread_bindings * local_size_compressed + v vi, vj = T.index_to_coordinates(index, B_shared_shape) B_local[v] = B_shared[vi, vj] @@ -260,12 +258,11 @@ def main( ) for v in T.vectorized(0, local_size): - index = (i * threads * local_size + thread_bindings * local_size + v) + index = i * threads * local_size + thread_bindings * local_size + v vi, vj = T.index_to_coordinates(index, B_dequantize_shared_shape) B_dequantize_shared[vi, vj] = B_dequantize_local[v] for ki in T.serial(0, (block_K // micro_size_k)): - # Load A into fragment mma_emitter.ldmatrix_a( A_frag, @@ -320,12 +317,12 @@ def general_compress(lowprecision_weight, source_bits=4, storage_dtype=np.int8): # interleave weight numpy implementation -def interleave_weight(qweight, nbits=4, target_dtype="float16"): - assert target_dtype in ["float16", "int8"] +def interleave_weight(qweight, nbits=4, target_dtype=T.float16): + assert target_dtype in [T.float16, T.int8] # reinterpret the data type of qweight to int32 qweight = qweight.view(np.int32) new_qweight = np.zeros_like(qweight) - bits_stride = 8 if target_dtype == "int8" else 16 + bits_stride = 8 if target_dtype == T.int8 else 16 mask = (1 << nbits) - 1 # for 4bit the val is 0x0000000f num_groups = 
32 // bits_stride elems_per_group = bits_stride // nbits @@ -335,7 +332,7 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): shift = (offset % num_groups) * bits_stride + (offset // num_groups) * nbits new_qweight |= ((qweight >> (nbits * offset)) & mask) << shift - if nbits == 1 and target_dtype == "int8": + if nbits == 1 and target_dtype == T.int8: # special handling for 1b interleave n16_weight = new_qweight & np.int32(0xF0F00F0F) n16_weight |= ((new_qweight & np.int32(0x000000F0)) >> 4) << 16 @@ -343,12 +340,12 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): n16_weight |= ((new_qweight & np.int32(0x000F0000)) >> 16) << 4 n16_weight |= ((new_qweight & np.int32(0x0F000000)) >> 24) << 12 return n16_weight.view(np.int8) - elif nbits == 2 and target_dtype == "float16": + elif nbits == 2 and target_dtype == T.float16: n8_weight = new_qweight & np.int32(0xFF0000FF) n8_weight |= ((new_qweight & np.int32(0x0000FF00)) >> 8) << 16 n8_weight |= ((new_qweight & np.int32(0x00FF0000)) >> 16) << 8 return n8_weight.view(np.int8) - elif nbits == 1 and target_dtype == "float16": + elif nbits == 1 and target_dtype == T.float16: n8_weight = new_qweight & 0xF000000F n8_weight |= ((new_qweight & 0x000000F0) >> 4) << 8 n8_weight |= ((new_qweight & 0x00000F00) >> 8) << 16 @@ -360,13 +357,7 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): return new_qweight.view(np.int8) -def assert_bitnet_158_int8xint2_prefill_correctness(M, - N, - K, - in_dtype, - out_dtype, - accum_dtype, - fast_decoding=True): +def assert_bitnet_158_int8xint2_prefill_correctness(M, N, K, in_dtype, out_dtype, accum_dtype, fast_decoding=True): program = bitnet_158_int8xint2_prefill(M, N, K, in_dtype, out_dtype, accum_dtype, fast_decoding) print(program) kernel = tilelang.compile(program) @@ -391,4 +382,4 @@ def assert_bitnet_158_int8xint2_prefill_correctness(M, if __name__ == "__main__": - assert_bitnet_158_int8xint2_prefill_correctness(256, 256, 256, "int8", "int32", "int32") + assert_bitnet_158_int8xint2_prefill_correctness(256, 256, 256, T.int8, T.int32, T.int32) diff --git a/examples/bitnet-1.58b/kernel_benchmark/tl_int8xint8.py b/examples/bitnet-1.58b/kernel_benchmark/tl_int8xint8.py index 986463598..e3d35df4b 100644 --- a/examples/bitnet-1.58b/kernel_benchmark/tl_int8xint8.py +++ b/examples/bitnet-1.58b/kernel_benchmark/tl_int8xint8.py @@ -6,7 +6,8 @@ import tvm.tl.language as T from bitblas.tl.utils import get_swizzle_layout from bitblas.tl.mma_macro_generator import ( - TensorCoreIntrinEmitter,) + TensorCoreIntrinEmitter, +) from bitblas.base import simplify_prim_func torch.manual_seed(0) @@ -37,18 +38,18 @@ def tl_matmul( accum_dtype, ): assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - if out_dtype == "int32": + if out_dtype == T.int32: micro_size_k = 32 # This is a debug config @@ -56,7 +57,7 @@ def tl_matmul( block_col_warps = 2 warp_row_tiles = 64 warp_col_tiles = 64 - chunk = 32 if in_dtype == "float16" else 64 + chunk = 32 if in_dtype == T.float16 else 64 shared_scope = "shared.dyn" # Pipeline Stage @@ -101,12 +102,11 @@ def tl_matmul( @T.prim_func def main( - A: T.Buffer(A_shape, in_dtype), - B: T.Buffer(B_shape, in_dtype), - C: T.Buffer((M, N), out_dtype), + A: T.Buffer(A_shape, in_dtype), + B: 
T.Buffer(B_shape, in_dtype), + C: T.Buffer((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) @@ -116,10 +116,12 @@ def main( thread_bindings = T.thread_binding(0, threads, "threadIdx.x") - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - B_shared: make_swizzle_layout(B_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + B_shared: make_swizzle_layout(B_shared), + } + ) # Improve L2 Cache T.use_swizzle(panel_size=10) @@ -127,7 +129,6 @@ def main( T.clear(C_local) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, block_K): A_shared[i, k] = A[by * block_M + i, ko * block_K + k] @@ -137,7 +138,6 @@ def main( B_shared[j, k] = B[bx * block_N + j, ko * block_K + k] for ki in T.serial(0, (block_K // micro_size_k)): - # Load A into fragment mma_emitter.ldmatrix_a( A_local, @@ -183,7 +183,7 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): # src_code is the generated cuda source assert src_code is not None print(src_code) - if in_dtype == "int8": + if in_dtype == T.int8: A = torch.randint(-7, 7, (M, K), device="cuda", dtype=torch.int8) B = torch.randint(-7, 7, (N, K), device="cuda", dtype=torch.int8) else: @@ -209,12 +209,12 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): def test_assert_tl_matmul(): - assert_tl_matmul_correctness(128, 128, 128, "float16", "float16", "float16") - assert_tl_matmul_correctness(128, 256, 256, "float16", "float32", "float32") + assert_tl_matmul_correctness(128, 128, 128, T.float16, T.float16, T.float16) + assert_tl_matmul_correctness(128, 256, 256, T.float16, T.float32, T.float32) if __name__ == "__main__": # bitblas.testing.main() - # assert_tl_matmul_correctness(128, 128, 128, "float16", "float16", "float16") - # assert_tl_matmul_correctness(128, 128, 128, "int8", "int32", "int32") - assert_tl_matmul_correctness(16384, 16384, 16384, "int8", "int32", "int32") + # assert_tl_matmul_correctness(128, 128, 128, T.float16, T.float16, T.float16) + # assert_tl_matmul_correctness(128, 128, 128, T.int8, T.int32, T.int32) + assert_tl_matmul_correctness(16384, 16384, 16384, T.int8, T.int32, T.int32) diff --git a/examples/bitnet-1.58b/load_from_quantized.py b/examples/bitnet-1.58b/load_from_quantized.py index 26a32f974..8c775aa4c 100644 --- a/examples/bitnet-1.58b/load_from_quantized.py +++ b/examples/bitnet-1.58b/load_from_quantized.py @@ -49,7 +49,13 @@ def generate_text(model, tokenizer, prompt, max_length=100): def main(): # load quantized model - qmodel = BitnetForCausalLM.from_quantized(saved_model_path,).cuda().half() + qmodel = ( + BitnetForCausalLM.from_quantized( + saved_model_path, + ) + .cuda() + .half() + ) tokenizer = BitnetTokenizer.from_pretrained(model_name_or_path, use_fast=False) # print("original model generated text:") # print(generate_text(model, tokenizer, "Hi, ", max_length=100)) diff --git a/examples/bitnet-1.58b/maint/README.md b/examples/bitnet-1.58b/maint/README.md index 63cc3e275..6bccdf93a 100644 --- a/examples/bitnet-1.58b/maint/README.md +++ b/examples/bitnet-1.58b/maint/README.md @@ -2,7 +2,6 @@ license: mit --- - This is a BitBLAS Implementation for the reproduced 1.58bit model from 
[1bitLLM/bitnet_b1_58-3B](https://huggingface.co/1bitLLM/bitnet_b1_58-3B). We replaced the original simulated Int8x3bit Quantized Inference Kernel with BitBLAS INT8xINT2 Kernel. We also evaluated the model's correctness and performance through `eval_correctness.py` and `benchmark_inference_latency.py`. ## Latest News @@ -88,4 +87,4 @@ The differences between the reported numbers and the reproduced results are poss journal={arXiv preprint arXiv:2402.17764}, year={2024} } -``` \ No newline at end of file +``` diff --git a/examples/bitnet-1.58b/maint/create_bitblas_ckpt.py b/examples/bitnet-1.58b/maint/create_bitblas_ckpt.py index 1e29a553a..2604ef387 100644 --- a/examples/bitnet-1.58b/maint/create_bitblas_ckpt.py +++ b/examples/bitnet-1.58b/maint/create_bitblas_ckpt.py @@ -25,9 +25,9 @@ args = parser.parse_args() model_name_or_path = args.model_name_or_path -saved_model_path = os.path.join( - dirpath, "models", - f"{model_name_or_path}_bitblas") if args.saved_model_path is None else args.saved_model_path +saved_model_path = ( + os.path.join(dirpath, "models", f"{model_name_or_path}_bitblas") if args.saved_model_path is None else args.saved_model_path +) def generate_text(model, tokenizer, prompt, max_length=100): @@ -67,7 +67,10 @@ def main(): model_name_or_path, use_flash_attention_2=False, torch_dtype=torch.float16, - ).cuda().half()) + ) + .cuda() + .half() + ) tokenizer = BitnetTokenizer.from_pretrained(model_name_or_path, use_fast=False) # print("original model generated text:") @@ -112,10 +115,16 @@ def main(): file_path = cached_file(model_name_or_path, file) os.system(f"cp {file_path} {saved_model_path}") # load quantized model - qmodel = BitnetForCausalLM.from_quantized(saved_model_path,).cuda().half() + qmodel = ( + BitnetForCausalLM.from_quantized( + saved_model_path, + ) + .cuda() + .half() + ) print("quantized model generated text:") print(generate_text(qmodel, tokenizer, "Hi, ", max_length=100)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh b/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh index 741c3a124..b0430588a 100755 --- a/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh +++ b/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + # retrieve the native model input and saved model directory MODEL_DIR=$1 SAVED_MODEL_DIR=$2 diff --git a/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh b/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh index a2df0eb8c..66356d3d8 100755 --- a/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh +++ b/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + # require git lfs if ! 
command -v git-lfs &> /dev/null; then echo "Please install git-lfs first by running 'sudo apt install git-lfs'" diff --git a/examples/bitnet-1.58b/maint/quantize_config.json b/examples/bitnet-1.58b/maint/quantize_config.json index e2b24123a..80fbf02f0 100644 --- a/examples/bitnet-1.58b/maint/quantize_config.json +++ b/examples/bitnet-1.58b/maint/quantize_config.json @@ -7,4 +7,4 @@ "model_name_or_path": "1bitLLM/bitnet_b1_58-3B", "quant_method": "bitnet", "checkpoint_format": "bitnet" -} \ No newline at end of file +} diff --git a/examples/bitnet-1.58b/maint/upload_models.sh b/examples/bitnet-1.58b/maint/upload_models.sh index b764b0da6..7c6d76e32 100755 --- a/examples/bitnet-1.58b/maint/upload_models.sh +++ b/examples/bitnet-1.58b/maint/upload_models.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + MODEL_DIR=$1 REMOTE_DIR=$2 diff --git a/examples/bitnet-1.58b/modeling_bitnet.py b/examples/bitnet-1.58b/modeling_bitnet.py index 6e3c42b6f..1830995ee 100644 --- a/examples/bitnet-1.58b/modeling_bitnet.py +++ b/examples/bitnet-1.58b/modeling_bitnet.py @@ -64,8 +64,7 @@ def find_layers(module, layers=None, name=""): return {name: module} res = {} for name1, child in module.named_children(): - res.update( - find_layers(child, layers=layers, name=name + "." + name1 if name != "" else name1)) + res.update(find_layers(child, layers=layers, name=name + "." + name1 if name != "" else name1)) return res @@ -87,7 +86,6 @@ def _get_unpad_data(attention_mask): class BitnetRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): """ BitnetRMSNorm is equivalent to T5LayerNorm @@ -108,34 +106,23 @@ def forward(self, hidden_states): class BitnetRotaryEmbedding(nn.Module): - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): super().__init__() self.scaling_factor = scaling_factor self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base - inv_freq = 1.0 / ( - self.base - **(torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) self.register_buffer("inv_freq", inv_freq) # For BC we register cos and sin cached self.max_seq_len_cached = max_position_embeddings - t = torch.arange( - self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) t = t / self.scaling_factor freqs = torch.outer(t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer( - "_cos_cached", emb.cos().to(torch.get_default_dtype()), persistent=False) - self.register_buffer( - "_sin_cached", emb.sin().to(torch.get_default_dtype()), persistent=False) + self.register_buffer("_cos_cached", emb.cos().to(torch.get_default_dtype()), persistent=False) + self.register_buffer("_sin_cached", emb.sin().to(torch.get_default_dtype()), persistent=False) @property def sin_cached(self): @@ -156,14 +143,12 @@ def cos_cached(self): @torch.no_grad() def forward(self, x, position_ids): # x: [bs, num_attention_heads, seq_len, head_size] - inv_freq_expanded = self.inv_freq[None, :, - None].float().expand(position_ids.shape[0], -1, 1) + inv_freq_expanded = self.inv_freq[None, :, 
None].float().expand(position_ids.shape[0], -1, 1) position_ids_expanded = position_ids[:, None, :].float() # Force float32 since bfloat16 loses precision on long contexts # See https://github.com/huggingface/transformers/pull/29285 device_type = x.device.type - device_type = device_type if isinstance(device_type, - str) and device_type != "mps" else "cpu" + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" with torch.autocast(device_type=device_type, enabled=False): freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) @@ -174,8 +159,8 @@ def forward(self, x, position_ids): def rotate_half(x): """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) @@ -207,7 +192,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): class BitnetMLP(nn.Module): - def __init__(self, config): super().__init__() self.config = config @@ -245,7 +229,6 @@ def forward(self, x): class BitnetMLPFuseGateUp(nn.Module): - def __init__(self, config): super().__init__() self.config = config @@ -272,8 +255,7 @@ def __init__(self, config): def from_bit_mlp(cls, bit_mlp: BitnetMLP): module = cls(bit_mlp.config) # assign the weights - module.gate_up_proj.weight = nn.Parameter( - torch.cat([bit_mlp.gate_proj.weight, bit_mlp.up_proj.weight], dim=0)) + module.gate_up_proj.weight = nn.Parameter(torch.cat([bit_mlp.gate_proj.weight, bit_mlp.up_proj.weight], dim=0)) module.down_proj = bit_mlp.down_proj module.ffn_layernorm = bit_mlp.ffn_layernorm return module @@ -295,8 +277,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: batch, num_key_value_heads, slen, head_dim = hidden_states.shape if n_rep == 1: return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, - head_dim) + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) @@ -311,7 +292,8 @@ def __init__(self, config: BitnetConfig, layer_idx: Optional[int] = None): logger.warning_once( f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class.") + "when creating this class." + ) self.attention_dropout = config.attention_dropout self.hidden_size = config.hidden_size @@ -325,8 +307,8 @@ def __init__(self, config: BitnetConfig, layer_idx: Optional[int] = None): if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads}).") + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`: {self.num_heads})." 
+ ) self.q_proj = BitLinear( self.hidden_size, @@ -387,10 +369,8 @@ def forward( value_states = self.v_proj(hidden_states) query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) past_key_value = getattr(self, "past_key_value", past_key_value) cos, sin = self.rotary_emb(value_states, position_ids) @@ -399,30 +379,24 @@ def forward( if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt( - self.head_dim) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, :key_states.shape[-2]] + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask # upcast attention to fp32 - attn_weights = nn.functional.softmax( - attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout( - attn_weights, p=self.attention_dropout, training=self.training) + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) attn_output = torch.matmul(attn_weights, value_states) if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}") + raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is {attn_output.size()}") attn_output = attn_output.transpose(1, 2).contiguous() @@ -448,7 +422,8 @@ def __init__(self, config: BitnetConfig, layer_idx: Optional[int] = None): logger.warning_once( f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class.") + "when creating this class." + ) self.attention_dropout = config.attention_dropout self.hidden_size = config.hidden_size @@ -462,8 +437,8 @@ def __init__(self, config: BitnetConfig, layer_idx: Optional[int] = None): if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads}).") + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`: {self.num_heads})." 
+ ) self.qkv_proj = BitLinear( self.hidden_size, @@ -497,17 +472,12 @@ def from_bit_attention(cls, bit_attention: BitnetAttention): module = cls(bit_attention.config, bit_attention.layer_idx) # assign the weights module.qkv_proj.weight = nn.Parameter( - torch.cat([ - bit_attention.q_proj.weight, bit_attention.k_proj.weight, - bit_attention.v_proj.weight - ], - dim=0)) + torch.cat([bit_attention.q_proj.weight, bit_attention.k_proj.weight, bit_attention.v_proj.weight], dim=0) + ) if bit_attention.q_proj.bias is not None and bit_attention.k_proj.bias is not None and bit_attention.v_proj.bias is not None: module.qkv_proj.bias = nn.Parameter( - torch.cat([ - bit_attention.q_proj.bias, bit_attention.k_proj.bias, bit_attention.v_proj.bias - ], - dim=0)) + torch.cat([bit_attention.q_proj.bias, bit_attention.k_proj.bias, bit_attention.v_proj.bias], dim=0) + ) module.o_proj = bit_attention.o_proj module.inner_attn_ln = bit_attention.inner_attn_ln if bit_attention.config.rope_scaling is None: @@ -528,16 +498,13 @@ def forward( bsz, q_len, _ = hidden_states.size() qkv_states = self.qkv_proj(hidden_states) query_states, key_states, value_states = torch.split( - qkv_states, [ - self.num_heads * self.head_dim, self.num_key_value_heads * self.head_dim, - self.num_key_value_heads * self.head_dim - ], - dim=-1) + qkv_states, + [self.num_heads * self.head_dim, self.num_key_value_heads * self.head_dim, self.num_key_value_heads * self.head_dim], + dim=-1, + ) query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) past_key_value = getattr(self, "past_key_value", past_key_value) cos, sin = self.rotary_emb(value_states, position_ids) @@ -546,30 +513,24 @@ def forward( if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt( - self.head_dim) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, :key_states.shape[-2]] + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask # upcast attention to fp32 - attn_weights = nn.functional.softmax( - attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout( - attn_weights, p=self.attention_dropout, training=self.training) + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) 
attn_output = torch.matmul(attn_weights, value_states) if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}") + raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is {attn_output.size()}") attn_output = attn_output.transpose(1, 2).contiguous() @@ -622,10 +583,8 @@ def forward( # batch_size x seq_length x head_dim x hidden_dim # therefore we just need to keep the original shape query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) @@ -635,8 +594,7 @@ def forward( if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache # to be able to avoid many of these transpose/reshape/view. @@ -665,14 +623,14 @@ def forward( logger.warning_once( f"The input hidden states seems to be silently casted in float32, this might be related to" f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}.") + f" {target_dtype}." + ) query_states = query_states.to(target_dtype) key_states = key_states.to(target_dtype) value_states = value_states.to(target_dtype) - attn_output = self._flash_attention_forward( - query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate) + attn_output = self._flash_attention_forward(query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate) attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() attn_output = self.inner_attn_ln(attn_output) @@ -683,14 +641,9 @@ def forward( return attn_output, attn_weights, past_key_value - def _flash_attention_forward(self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None): + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token first unpad the input, then computes the attention scores and pad the final attention scores. 
@@ -720,7 +673,8 @@ def _flash_attention_forward(self, if attention_mask is not None: batch_size = query_states.shape[0] query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length) + query_states, key_states, value_states, attention_mask, query_length + ) cu_seqlens_q, cu_seqlens_k = cu_seq_lens max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens @@ -740,13 +694,7 @@ def _flash_attention_forward(self, attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal) + attn_output = flash_attn_func(query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal) return attn_output @@ -754,28 +702,24 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - key_layer = index_first_axis( - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k) - value_layer = index_first_axis( - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k) + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k) if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k) + query_layer = index_first_axis(query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k) cu_seqlens_q = cu_seqlens_k max_seqlen_in_batch_q = max_seqlen_in_batch_k indices_q = indices_k elif query_length == 1: max_seqlen_in_batch_q = 1 cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, - device=query_layer.device) # There is a memcpy here, that is very bad. + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. indices_q = cu_seqlens_q[:-1] query_layer = query_layer.squeeze(1) else: # The -q_len: slice assumes left padding. attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( - query_layer, attention_mask) + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) return ( query_layer, @@ -794,13 +738,11 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query class BitnetDecoderLayer(nn.Module): - def __init__(self, config: BitnetConfig, layer_idx: int): super().__init__() self.hidden_size = config.hidden_size - self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation]( - config=config, layer_idx=layer_idx) + self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) self.mlp = BitnetMLP(config) self.input_layernorm = BitnetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -834,7 +776,8 @@ def forward( if "padding_mask" in kwargs: warnings.warn( "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`", - stacklevel=2) + stacklevel=2, + ) residual = hidden_states @@ -925,8 +868,7 @@ def _setup_cache(self, cache_cls, max_batch_size, max_cache_len: Optional[int] = dtype = self.config._pre_quantization_dtype else: dtype = layer.self_attn.o_proj.weight.dtype - layer.self_attn.past_key_value = cache_cls( - self.config, max_batch_size, max_cache_len, device=device, dtype=dtype) + layer.self_attn.past_key_value = cache_cls(self.config, max_batch_size, max_cache_len, device=device, dtype=dtype) def _reset_cache(self): for layer in self.model.layers: @@ -1025,9 +967,7 @@ def __init__(self, config: BitnetConfig): self.vocab_size = config.vocab_size self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([ - BitnetDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers) - ]) + self.layers = nn.ModuleList([BitnetDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]) self.norm = BitnetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.gradient_checkpointing = False @@ -1055,21 +995,15 @@ def forward( cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None else self.config.output_hidden_states) + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one") if self.gradient_checkpointing and self.training and use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." - ) + logger.warning_once("`use_cache=True` is incompatible with gradient checkpointing. 
Setting `use_cache=False`.") use_cache = False if inputs_embeds is None: @@ -1083,10 +1017,7 @@ def forward( if cache_position is None: if isinstance(past_key_values, StaticCache): raise ValueError("cache_position is a required argument when using StaticCache.") - cache_position = torch.arange( - past_seen_tokens, - past_seen_tokens + inputs_embeds.shape[1], - device=inputs_embeds.device) + cache_position = torch.arange(past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device) if position_ids is None: position_ids = cache_position.unsqueeze(0) @@ -1143,12 +1074,9 @@ def forward( next_cache = None if use_cache: - next_cache = ( - next_decoder_cache.to_legacy_cache() - if isinstance(next_decoder_cache, Cache) else next_decoder_cache) + next_cache = next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None) + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=next_cache, @@ -1172,14 +1100,9 @@ def _update_causal_mask(self, attention_mask, input_tensor, cache_position): if hasattr(self.layers[0].self_attn, "past_key_value"): # static cache target_length = self.config.max_position_embeddings else: # dynamic cache - target_length = ( - attention_mask.shape[-1] - if isinstance(attention_mask, torch.Tensor) else cache_position[-1] + 1) - - causal_mask = torch.full((sequence_length, target_length), - fill_value=min_dtype, - dtype=dtype, - device=device) + target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else cache_position[-1] + 1 + + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) if sequence_length != 1: causal_mask = torch.triu(causal_mask, diagonal=1) causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) @@ -1188,10 +1111,8 @@ def _update_causal_mask(self, attention_mask, input_tensor, cache_position): causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit if attention_mask.dim() == 2: mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[..., :mask_length].eq( - 0.0) * attention_mask[:, None, None, :].eq(0.0) - causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill( - padding_mask, min_dtype) + padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) elif attention_mask.dim() == 4: # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with # cache. In that case, the 4D attention mask attends to the newest tokens only. @@ -1201,8 +1122,7 @@ def _update_causal_mask(self, attention_mask, input_tensor, cache_position): offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[:mask_shape[0], :mask_shape[1], - offset:mask_shape[2] + offset, :mask_shape[3]] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = mask_slice return causal_mask @@ -1279,9 +1199,7 @@ def forward( "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None else self.config.output_hidden_states) + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states return_dict = return_dict if return_dict is not None else self.config.use_return_dict # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) @@ -1327,13 +1245,9 @@ def forward( attentions=outputs.attentions, ) - def prepare_inputs_for_generation(self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - **kwargs): + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, cache_position=None, **kwargs + ): # With static cache, the `past_key_values` is None # TODO joao: standardize interface for the different Cache classes and remove of this if has_static_cache = False @@ -1344,13 +1258,13 @@ def prepare_inputs_for_generation(self, past_length = 0 if past_key_values is not None: if isinstance(past_key_values, Cache): - past_length = cache_position[ - 0] if cache_position is not None else past_key_values.get_seq_length() + past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length() max_cache_length = ( torch.tensor(past_key_values.get_max_length(), device=input_ids.device) - if past_key_values.get_max_length() is not None else None) - cache_length = past_length if max_cache_length is None else torch.min( - max_cache_length, past_length) + if past_key_values.get_max_length() is not None + else None + ) + cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length) # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects else: cache_length = past_length = past_key_values[0][0].shape[2] @@ -1361,7 +1275,7 @@ def prepare_inputs_for_generation(self, # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as # input) if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):] + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard # input_ids based on the past_length. elif past_length < input_ids.shape[1]: @@ -1369,8 +1283,7 @@ def prepare_inputs_for_generation(self, # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
- if (max_cache_length is not None and attention_mask is not None and - cache_length + input_ids.shape[1] > max_cache_length): + if max_cache_length is not None and attention_mask is not None and cache_length + input_ids.shape[1] > max_cache_length: attention_mask = attention_mask[:, -max_cache_length:] position_ids = kwargs.get("position_ids") @@ -1379,7 +1292,7 @@ def prepare_inputs_for_generation(self, position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1]:] + position_ids = position_ids[:, -input_ids.shape[1] :] # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if inputs_embeds is not None and past_key_values is None: @@ -1392,39 +1305,38 @@ def prepare_inputs_for_generation(self, input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1] if cache_position is None: - cache_position = torch.arange( - past_length, past_length + input_length, device=input_ids.device) + cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device) else: cache_position = cache_position[-input_length:] if has_static_cache: past_key_values = None - model_inputs.update({ - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - }) + model_inputs.update( + { + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) return model_inputs @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: - reordered_past += (tuple( - past_state.index_select(0, beam_idx.to(past_state.device)) - for past_state in layer_past),) + reordered_past += (tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),) return reordered_past @staticmethod def recursive_set(model, name, attr): - ''' - set layers.25.mlp.up_proj to attr - ''' + """ + set layers.25.mlp.up_proj to attr + """ - names = name.split('.') + names = name.split(".") obj = model for n in names[:-1]: obj = getattr(obj, n) @@ -1521,6 +1433,7 @@ def from_quantized( fuse_gateup = quant_config.get("fuse_gateup", True) import accelerate + if checkpoint_format == "bitblas": model = cls(config) for name, module in model.named_modules(): @@ -1567,7 +1480,6 @@ def from_quantized( LLAMA_START_DOCSTRING, ) class BitnetForSequenceClassification(BitnetPreTrainedModel): - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1631,8 +1543,7 @@ def forward( else: if input_ids is not None: # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, - self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 sequence_lengths = sequence_lengths % input_ids.shape[-1] sequence_lengths = sequence_lengths.to(logits.device) else: @@ -1646,8 +1557,7 @@ def forward( if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or - labels.dtype == torch.int): + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): 
self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" diff --git a/examples/bitnet-1.58b/nvidia_measure_memory.sh b/examples/bitnet-1.58b/nvidia_measure_memory.sh index e8998f309..82cf4855f 100755 --- a/examples/bitnet-1.58b/nvidia_measure_memory.sh +++ b/examples/bitnet-1.58b/nvidia_measure_memory.sh @@ -1 +1,3 @@ +#!/usr/bin/env bash + nvidia-smi --query-gpu=memory.used --format=csv -lms 500 diff --git a/examples/bitnet-1.58b/tokenization_bitnet.py b/examples/bitnet-1.58b/tokenization_bitnet.py index 6fea3252a..2adfd6dee 100644 --- a/examples/bitnet-1.58b/tokenization_bitnet.py +++ b/examples/bitnet-1.58b/tokenization_bitnet.py @@ -18,6 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for LLaMA.""" + import os from shutil import copyfile from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple @@ -37,12 +38,10 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "hf-internal-testing/llama-tokenizer": - "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model", + "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model", }, "tokenizer_file": { - "hf-internal-testing/llama-tokenizer": - "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json", + "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { @@ -159,14 +158,10 @@ def __init__( **kwargs, ): self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - bos_token = AddedToken( - bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token - eos_token = AddedToken( - eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token - unk_token = AddedToken( - unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token - pad_token = AddedToken( - pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token + bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token if legacy is None: logger.warning_once( @@ -174,7 +169,8 @@ def __init__( " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you." " If you want to use the new behavior, set `legacy=False`. 
This should only be set if you understand what it" " means, and thoroughly read the reason why this was added as explained in" - " https://github.com/huggingface/transformers/pull/24565") + " https://github.com/huggingface/transformers/pull/24565" + ) legacy = True self.legacy = legacy @@ -214,8 +210,7 @@ def get_spm_processor(self, from_slow=False): with open(self.vocab_file, "rb") as f: sp_model = f.read() - model_pb2 = import_protobuf( - f"The new behavior of {self.__class__.__name__} (with `self.legacy = False`)") + model_pb2 = import_protobuf(f"The new behavior of {self.__class__.__name__} (with `self.legacy = False`)") model = model_pb2.ModelProto.FromString(sp_model) normalizer_spec = model_pb2.NormalizerSpec() normalizer_spec.add_dummy_prefix = False @@ -261,8 +256,7 @@ def tokenize(self, text: "TextInput", **kwargs) -> List[str]: tokens = super().tokenize(text, **kwargs) - if len(tokens - ) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens: + if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens: tokens = tokens[1:] return tokens @@ -284,7 +278,7 @@ def _tokenize(self, text, **kwargs): # 1. Encode string + prefix ex: " Hey" tokens = self.sp_model.encode(self.unk_token + text, out_type=str) # 2. Remove self.unk_token from ['<','unk','>', '▁Hey'] - return tokens[self.unk_token_length:] if len(tokens) >= self.unk_token_length else tokens + return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" @@ -332,12 +326,9 @@ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) if not os.path.isdir(save_directory): logger.error(f"Vocabulary path ({save_directory}) should be a directory") return - out_vocab_file = os.path.join(save_directory, - (filename_prefix + "-" if filename_prefix else "") + - VOCAB_FILES_NAMES["vocab_file"]) + out_vocab_file = os.path.join(save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]) - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile( - self.vocab_file): + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: @@ -357,10 +348,9 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): return output - def get_special_tokens_mask(self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None, - already_has_special_tokens: bool = False) -> List[int]: + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: """ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. @@ -377,20 +367,16 @@ def get_special_tokens_mask(self, `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
""" if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True) + return super().get_special_tokens_mask(token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True) bos_token_id = [1] if self.add_bos_token else [] eos_token_id = [1] if self.add_eos_token else [] if token_ids_1 is None: return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id - return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + bos_token_id + - ([0] * len(token_ids_1)) + eos_token_id) + return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + bos_token_id + ([0] * len(token_ids_1)) + eos_token_id - def create_token_type_ids_from_sequences(self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: + def create_token_type_ids_from_sequences(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT sequence pair mask has the following format: @@ -473,9 +459,9 @@ def default_chat_template(self): "{% elif message['role'] == 'assistant' %}" "{{ ' ' + content.strip() + ' ' + eos_token }}" "{% endif %}" - "{% endfor %}") - template = template.replace("USE_DEFAULT_PROMPT", - "true" if self.use_default_system_prompt else "false") + "{% endfor %}" + ) + template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) diff --git a/examples/bitnet-1.58b/utils_quant.py b/examples/bitnet-1.58b/utils_quant.py index 5f5db5dbc..5a50edb39 100644 --- a/examples/bitnet-1.58b/utils_quant.py +++ b/examples/bitnet-1.58b/utils_quant.py @@ -24,15 +24,14 @@ def weight_quant(weight, num_bits=1): def activation_quant(x, num_bits=8): dtype = x.dtype x = x.float() - Qn = -(2**(num_bits - 1)) - Qp = 2**(num_bits - 1) - 1 + Qn = -(2 ** (num_bits - 1)) + Qp = 2 ** (num_bits - 1) - 1 s = Qp / x.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5) result = (x * s).round().clamp(Qn, Qp) / s return result.type(dtype) class BitLinearBitBLAS(nn.Module): - def __init__( self, in_features: int, @@ -68,7 +67,7 @@ def __init__( self.bitblas_matmul = self._get_or_create_bitblas_operator(matmul_config, ENABLE_TUNING) self.format = "bitnet" - self.Qp = 2**(self.input_bits - 1) - 1 + self.Qp = 2 ** (self.input_bits - 1) - 1 def _get_or_create_bitblas_operator(self, config, enable_tuning): if global_operator_cache.size() == 0: @@ -99,8 +98,7 @@ def replace_weight_param_with_qweight(self): @classmethod def from_bit_linear(cls, bitlinear, weight_group=1): - bitblas_linear = cls( - bitlinear.in_features, bitlinear.out_features, weight_bits=1, input_bits=8) + bitblas_linear = cls(bitlinear.in_features, bitlinear.out_features, weight_bits=1, input_bits=8) sw, qweight = bitblas_linear.create_bitblas_weights(bitlinear.weight, weight_group) bitblas_linear.register_buffer("qweight", qweight) bitblas_linear.register_buffer("sw", sw) @@ -158,8 +156,8 @@ def weight_quant(weight): @torch.compile def activation_quant(self, x, num_bits=8): x = x.float() - Qn = -(2**(num_bits - 1)) - Qp = 2**(num_bits - 1) - 1 + Qn = -(2 ** (num_bits - 1)) + Qp = 2 ** (num_bits - 1) - 1 s = Qp / x.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5) result = (x * s).round().clamp(Qn, Qp) return 
result.type(torch.int8), s @@ -173,9 +171,8 @@ def post_quant_process(self, input, si, sw): # for the correctness evaluation. def native_forward(self, input): - quant_input = (input + (activation_quant(input, self.input_bits) - input).detach()) - quant_weight = ( - self.weight + (weight_quant(self.weight, self.weight_bits) - self.weight).detach()) + quant_input = input + (activation_quant(input, self.input_bits) - input).detach() + quant_weight = self.weight + (weight_quant(self.weight, self.weight_bits) - self.weight).detach() out = nn.functional.linear(quant_input, quant_weight) if self.bias is not None: @@ -214,7 +211,6 @@ def forward(self, input): # Naive BitLinear from HuggingFace class BitLinear(nn.Linear): - def __init__(self, *kargs, weight_bits=1, input_bits=8, **kwargs): super(BitLinear, self).__init__(*kargs, **kwargs) """ @@ -224,10 +220,8 @@ def __init__(self, *kargs, weight_bits=1, input_bits=8, **kwargs): self.input_bits = input_bits def forward(self, input): - quant_input = input + (activation_quant(input, self.input_bits) - input).detach() - quant_weight = self.weight + (weight_quant(self.weight, self.weight_bits) - - self.weight).detach() + quant_weight = self.weight + (weight_quant(self.weight, self.weight_bits) - self.weight).detach() out = nn.functional.linear(quant_input, quant_weight) if self.bias is not None: diff --git a/examples/bitnet-1.58b/vllm_workspace/conftest.py b/examples/bitnet-1.58b/vllm_workspace/conftest.py index 951f38991..e9e2997ef 100644 --- a/examples/bitnet-1.58b/vllm_workspace/conftest.py +++ b/examples/bitnet-1.58b/vllm_workspace/conftest.py @@ -20,7 +20,7 @@ from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset from vllm.config import TokenizerPoolConfig -from vllm.distributed import (destroy_distributed_environment, destroy_model_parallel) +from vllm.distributed import destroy_distributed_environment, destroy_model_parallel from vllm.inputs import TextPrompt from vllm.logger import init_logger from vllm.sequence import SampleLogprobs @@ -56,12 +56,13 @@ class _ImageAssetsBase(UserList[ImageAsset]): class _ImageAssets(_ImageAssetsBase): - def __init__(self) -> None: - super().__init__([ - ImageAsset("stop_sign"), - ImageAsset("cherry_blossom"), - ]) + super().__init__( + [ + ImageAsset("stop_sign"), + ImageAsset("cherry_blossom"), + ] + ) def prompts(self, prompts: _ImageAssetPrompts) -> List[str]: """ @@ -136,7 +137,6 @@ def image_assets() -> _ImageAssets: class HfRunner: - def wrap_device(self, input: _T) -> _T: if not is_cpu(): return input.to("cuda") @@ -166,7 +166,8 @@ def __init__( SentenceTransformer( model_name, device="cpu", - ).to(dtype=torch_dtype)) + ).to(dtype=torch_dtype) + ) else: if is_vision_model: auto_cls = AutoModelForVision2Seq @@ -184,7 +185,8 @@ def __init__( torch_dtype=torch_dtype, trust_remote_code=True, **model_kwargs, - )) + ) + ) self.tokenizer = AutoTokenizer.from_pretrained( model_name, @@ -204,8 +206,7 @@ def __init__( ) except Exception: logger.warning( - "Unable to auto-load processor from HuggingFace for " - "model %s. Using tokenizer instead.", + "Unable to auto-load processor from HuggingFace for model %s. 
Using tokenizer instead.", model_name, ) self.processor = self.tokenizer @@ -362,7 +363,7 @@ def generate_greedy_logprobs_limit( last_hidden_states, self.model.get_output_embeddings().weight.t(), ) - if (getattr(self.model.get_output_embeddings(), "bias", None) is not None): + if getattr(self.model.get_output_embeddings(), "bias", None) is not None: logits += self.model.get_output_embeddings().bias.unsqueeze(0) logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) seq_logprobs.append(logprobs) @@ -389,8 +390,7 @@ def generate_greedy_logprobs_limit( all_output_strs.append(self.tokenizer.decode(output_ids)) outputs = zip(all_output_ids, all_output_strs, all_logprobs) - return [(output_ids, output_str, output_logprobs) - for output_ids, output_str, output_logprobs in outputs] + return [(output_ids, output_str, output_logprobs) for output_ids, output_str, output_logprobs in outputs] def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]: return self.model.encode(prompts) @@ -409,7 +409,6 @@ def hf_runner(): class VllmRunner: - def __init__( self, model_name: str, @@ -514,12 +513,10 @@ def generate_greedy_logprobs( num_logprobs: int, images: Optional[List[Image.Image]] = None, ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: - greedy_logprobs_params = SamplingParams( - temperature=0.0, max_tokens=max_tokens, logprobs=num_logprobs) + greedy_logprobs_params = SamplingParams(temperature=0.0, max_tokens=max_tokens, logprobs=num_logprobs) outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params, images=images) - return [(output_ids, output_str, output_logprobs) - for output_ids, output_str, output_logprobs in outputs] + return [(output_ids, output_str, output_logprobs) for output_ids, output_str, output_logprobs in outputs] def generate_beam_search( self, diff --git a/examples/bitnet-1.58b/vllm_workspace/inference_with_compress_format.py b/examples/bitnet-1.58b/vllm_workspace/inference_with_compress_format.py index 55a24543e..ea18239cb 100644 --- a/examples/bitnet-1.58b/vllm_workspace/inference_with_compress_format.py +++ b/examples/bitnet-1.58b/vllm_workspace/inference_with_compress_format.py @@ -32,15 +32,14 @@ ckpt_path = args.ckpt_path with VllmRunner( - ckpt_path, - dtype="half", - quantization="bitblas", - # set enforce_eager = False to enable cuda graph - # set enforce_eager = True to disable cuda graph - enforce_eager=False, + ckpt_path, + dtype="half", + quantization="bitblas", + # set enforce_eager = False to enable cuda graph + # set enforce_eager = True to disable cuda graph + enforce_eager=False, ) as bitnet_model: - bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], - max_tokens=1024) + bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], max_tokens=1024) print("bitnet inference:") print(bitbnet_outputs[0][0]) print(bitbnet_outputs[0][1]) diff --git a/examples/bitnet-1.58b/vllm_workspace/inference_with_native_format.py b/examples/bitnet-1.58b/vllm_workspace/inference_with_native_format.py index 4f5f87f6f..f631fb306 100644 --- a/examples/bitnet-1.58b/vllm_workspace/inference_with_native_format.py +++ b/examples/bitnet-1.58b/vllm_workspace/inference_with_native_format.py @@ -33,13 +33,13 @@ ckpt_path = args.ckpt_path with VllmRunner( - ckpt_path, - dtype="half", - quantization="bitnet_bitblas", - gpu_memory_utilization=0.5, - # set enforce_eager = False to enable cuda graph - # set enforce_eager = True to disable cuda graph - enforce_eager=False, + ckpt_path, + dtype="half", + 
quantization="bitnet_bitblas", + gpu_memory_utilization=0.5, + # set enforce_eager = False to enable cuda graph + # set enforce_eager = True to disable cuda graph + enforce_eager=False, ) as bitnet_model: bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], max_tokens=128) print("bitnet inference output:") diff --git a/examples/bitnet-1.58b/vllm_workspace/utils.py b/examples/bitnet-1.58b/vllm_workspace/utils.py index daa9d8f52..e96b19e28 100644 --- a/examples/bitnet-1.58b/vllm_workspace/utils.py +++ b/examples/bitnet-1.58b/vllm_workspace/utils.py @@ -3,8 +3,7 @@ TokensText = Tuple[List[int], str] -def check_outputs_equal(outputs_0_lst: List[TokensText], outputs_1_lst: List[TokensText], - name_0: str, name_1: str): +def check_outputs_equal(outputs_0_lst: List[TokensText], outputs_1_lst: List[TokensText], name_0: str, name_1: str): """ Compare the two sequences generated by different models, which should be equal. @@ -15,19 +14,14 @@ def check_outputs_equal(outputs_0_lst: List[TokensText], outputs_1_lst: List[Tok output_ids_0, output_str_0 = outputs_0 output_ids_1, output_str_1 = outputs_1 - assert output_str_0 == output_str_1, (f"Test{prompt_idx}:" - f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}") - assert output_ids_0 == output_ids_1, (f"Test{prompt_idx}:" - f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}") + assert output_str_0 == output_str_1, f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" + assert output_ids_0 == output_ids_1, f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" TokensTextLogprobs = Tuple[List[int], str, List[Dict[int, float]]] -def check_logprobs_close(outputs_0_lst: List[TokensTextLogprobs], - outputs_1_lst: List[TokensTextLogprobs], name_0: str, name_1: str): +def check_logprobs_close(outputs_0_lst: List[TokensTextLogprobs], outputs_1_lst: List[TokensTextLogprobs], name_0: str, name_1: str): """ Compare the logprobs of two sequences generated by different models, which should be similar but not necessarily equal. @@ -41,16 +35,11 @@ def check_logprobs_close(outputs_0_lst: List[TokensTextLogprobs], # Loop through generated tokens. for idx, (output_id_0, output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): - # If generated tokens don't match, then if output_id_0 != output_id_1: # Each predicted token must be in top N logprobs of the other - assert output_id_0 in logprobs_1[idx], (f"Test{prompt_idx}:" - f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}") - assert output_id_1 in logprobs_0[idx], (f"Test{prompt_idx}:" - f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}") + assert output_id_0 in logprobs_1[idx], f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" + assert output_id_1 in logprobs_0[idx], f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" # Break out since sequences will now diverge. break diff --git a/examples/blocksparse_attention/README.md b/examples/blocksparse_attention/README.md index 89f75b81d..34bf3c637 100644 --- a/examples/blocksparse_attention/README.md +++ b/examples/blocksparse_attention/README.md @@ -1,6 +1,5 @@ # Block-Sparse Flash-Attention -Tilelang implementation of block-sparse flash-attention kernels. - -The kernels have been used in [Rectified Sparse Attention](https://arxiv.org/abs/2506.04108) and [SeerAttention-R](https://arxiv.org/abs/2506.08889). +Tilelang implementation of block-sparse flash-attention kernels. 
+The kernels have been used in [Rectified Sparse Attention](https://arxiv.org/abs/2506.04108) and [SeerAttention-R](https://arxiv.org/abs/2506.08889). diff --git a/examples/blocksparse_attention/block_sparse_attn_triton.py b/examples/blocksparse_attention/block_sparse_attn_triton.py index 014f0c5fc..b94e602f6 100644 --- a/examples/blocksparse_attention/block_sparse_attn_triton.py +++ b/examples/blocksparse_attention/block_sparse_attn_triton.py @@ -1,7 +1,6 @@ # ruff: noqa: E712 import math import torch - import triton import triton.language as tl import torch.nn.functional as F @@ -15,10 +14,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -56,7 +52,6 @@ def _fwd_kernel_inner( BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, ): - mask_val = tl.load(block_mask_ptr + k_block_col_idx * stride_bmask_n) # print @@ -73,8 +68,7 @@ def _fwd_kernel_inner( # the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N if LAST_K_BLOCK: - qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, - float('-inf')) + qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, float("-inf")) m_ij = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_ij[:, None] @@ -154,7 +148,7 @@ def _fwd_kernel( v_ptrs = V + off_v mask_ptrs = block_mask_ptr + start_m * stride_bmm - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf') + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") l_i = tl.zeros([BLOCK_M], dtype=tl.float32) acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) @@ -192,24 +186,12 @@ def _fwd_kernel( acc = acc * l_recip acc = acc.to(Out.dtype.element_ty) - off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[ - None, :] * stride_od + off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :] * stride_od out_ptrs = Out + off_o tl.store(out_ptrs, acc, mask=offs_m[:, None] < N_CTX) -def _forward(ctx, - q, - k, - v, - block_sparse_mask, - sm_scale, - BLOCK_M=64, - BLOCK_N=64, - num_warps=None, - num_stages=1, - out=None): - +def _forward(ctx, q, k, v, block_sparse_mask, sm_scale, BLOCK_M=64, BLOCK_N=64, num_warps=None, num_stages=1, out=None): assert q.shape[-1] == k.shape[-1] == v.shape[-1] assert k.shape[2] == v.shape[2] o = out if out is not None else torch.empty_like(q).contiguous() @@ -254,7 +236,6 @@ def _forward(ctx, class _sparse_attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, block_sparse_dense, sm_scale): # shape constraints @@ -278,9 +259,9 @@ def test_topk_sparse_attention(): torch.manual_seed(0) # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + v = 
torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) @@ -288,9 +269,7 @@ def test_topk_sparse_attention(): downsample_len = math.ceil(SEQ_LEN / downsample_factor) print("downsample_len", downsample_len) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 print("x_ds.shape", x_ds.shape) block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) @@ -302,22 +281,21 @@ def test_topk_sparse_attention(): # Compute reference # Expand block mask to full attention matrix - full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')) + full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")) full_mask = full_mask[..., :SEQ_LEN, :SEQ_LEN].bool() full_mask = full_mask & torch.tril(torch.ones_like(full_mask)) # Apply causal # PyTorch reference implementation - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale - attn = attn.masked_fill(~full_mask, float('-inf')) + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale + attn = attn.masked_fill(~full_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) # print("ref_output", ref_output) # print("triton_output", triton_output) # Verify accuracy - assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), \ - "Triton output doesn't match reference" + assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), "Triton output doesn't match reference" print("Pass topk sparse attention test with qlen == klen") @@ -329,9 +307,9 @@ def test_topk_sparse_attention_qlt_kl(): torch.manual_seed(0) # Create inputs. - q = torch.randn(BATCH, N_HEADS, Q_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - k = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - v = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) + q = torch.randn(BATCH, N_HEADS, Q_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + k = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + v = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) # softmax scale sm_scale = 1.0 / (D_HEAD**0.5) @@ -339,8 +317,7 @@ def test_topk_sparse_attention_qlt_kl(): print("downsample_factor", downsample_factor) downsample_len = math.ceil(K_LEN / downsample_factor) # number of blocks along one dimension print("downsample_len", downsample_len) - x_ds = torch.randn( - BATCH, N_HEADS, downsample_len, downsample_len, device='cuda', dtype=torch.bfloat16) + x_ds = torch.randn(BATCH, N_HEADS, downsample_len, downsample_len, device="cuda", dtype=torch.bfloat16) # Force the first column to be high so that the first block is always selected. 
x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) @@ -351,26 +328,25 @@ def test_topk_sparse_attention_qlt_kl(): past_len = K_LEN - Q_LEN - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale - full_mask_full = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')).bool() + full_mask_full = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")).bool() full_mask_full = full_mask_full[..., :K_LEN, :K_LEN] effective_mask = full_mask_full[..., past_len:K_LEN, :] # shape: (B, H, Q_LEN, K_LEN) i_global = torch.arange(past_len, K_LEN, device=k.device).unsqueeze(1) # shape: (Q_LEN, 1) j_global = torch.arange(K_LEN, device=k.device).unsqueeze(0) # shape: (1, K_LEN) - causal_mask = (j_global <= i_global) # shape: (Q_LEN, K_LEN) + causal_mask = j_global <= i_global # shape: (Q_LEN, K_LEN) final_mask = effective_mask & causal_mask # shape: (B, H, Q_LEN, K_LEN) - attn = attn.masked_fill(~final_mask, float('-inf')) + attn = attn.masked_fill(~final_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) # Verify accuracy. - assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), \ - "Triton output doesn't match reference when qlen < klen" + assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), "Triton output doesn't match reference when qlen < klen" print("Pass topk sparse attention test with qlen < klen") diff --git a/examples/blocksparse_attention/example_tilelang_block_sparse_attn.py b/examples/blocksparse_attention/example_tilelang_block_sparse_attn.py index 7e90db7e5..9a394710f 100644 --- a/examples/blocksparse_attention/example_tilelang_block_sparse_attn.py +++ b/examples/blocksparse_attention/example_tilelang_block_sparse_attn.py @@ -1,8 +1,8 @@ import math import torch - import tilelang import tilelang.language as T +from tilelang.profiler import do_bench import torch.nn.functional as F @@ -10,10 +10,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -30,105 +27,34 @@ def get_sparse_attn_mask_from_threshold(x, threshold, use_dense_for_last_block=F @tilelang.jit( - out_idx=[4], pass_configs={ + out_idx=[4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal): block_M = 64 block_N = 64 num_stages = 1 threads = 128 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] block_mask_shape = [batch, heads, downsample_len, downsample_len] - dtype = "float16" - accum_dtype = "float" - block_mask_dtype = "bool" + dtype = T.float16 + accum_dtype = T.float32 + block_mask_dtype = T.bool def kernel_func(block_M, block_N, num_stages, threads): - - @T.macro - def MMA0( - K: T.Tensor(shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: 
T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(shape, dtype), - V_shared: T.SharedBuffer([block_M, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. - # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. 
- acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - @T.prim_func def blocksparse_flashattn( - Q: T.Tensor(shape, dtype), - K: T.Tensor(shape, dtype), - V: T.Tensor(shape, dtype), - BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), - Output: T.Tensor(shape, dtype), + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), + Output: T.Tensor(shape, dtype), ): - with T.Kernel( - T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): + with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -141,31 +67,59 @@ def blocksparse_flashattn( scores_scale = T.alloc_fragment([block_M], accum_dtype) scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - block_mask = T.alloc_local([downsample_len], block_mask_dtype) + block_mask = T.alloc_fragment([downsample_len], block_mask_dtype) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - for vj in T.serial(downsample_len): - block_mask[vj] = BlockSparseMask[bz, by, bx, vj] + T.copy(BlockSparseMask[bz, by, bx, :], block_mask) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): if block_mask[k] != 0: - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, - scores_sum, logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) + else: + T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + # To do causal softmax, we need to set the scores_max to 0 if it is -inf + # This process is called Check_inf in FlashAttention3 code, and it only need to be done + # in the first ceil_div(kBlockM, kBlockN) steps. 
+ # for i in T.Parallel(block_M): + # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + # max * log_2(e)) This allows the compiler to use the ffma + # instruction instead of fadd and fmul separately. + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return blocksparse_flashattn @@ -180,18 +134,16 @@ def test_topk_sparse_attention(): torch.manual_seed(0) # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) downsample_factor = BLOCK downsample_len = math.ceil(SEQ_LEN / downsample_factor) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) @@ -202,15 +154,15 @@ def test_topk_sparse_attention(): # Compute reference # Expand block mask to full attention matrix - full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')) + full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")) full_mask = full_mask[..., :SEQ_LEN, :SEQ_LEN].bool() full_mask = full_mask & torch.tril(torch.ones_like(full_mask)) # Apply causal # PyTorch reference implementation - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale - attn = attn.masked_fill(~full_mask, float('-inf')) + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale + attn = attn.masked_fill(~full_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) print("ref_output", ref_output) print("tilelang_output", tilelang_output) @@ -224,5 +176,26 @@ def main(): test_topk_sparse_attention() +def run_regression_perf(): + BATCH, N_HEADS, SEQ_LEN, D_HEAD = 1, 32, 256, 64 + TOPK = 2 + BLOCK = 64 + torch.manual_seed(0) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, 
SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + downsample_factor = BLOCK + downsample_len = math.ceil(SEQ_LEN / downsample_factor) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) + x_ds[:, :, :, 0] = 100 + block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) + kernel = blocksparse_flashattn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, downsample_len, is_causal=True) + + def run_kernel_only(): + kernel(q, k, v, block_mask) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py index e29982162..a93e4de13 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py @@ -2,209 +2,172 @@ import torch import torch.nn.functional as F import tilelang -from tilelang.autotuner import * import tilelang.language as T from einops import rearrange, einsum import argparse import time import math +from tilelang.profiler import do_bench from heuristic import num_splits_heuristic -def flashattn(batch, heads, heads_kv, dim, dim_v): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" +@tilelang.jit( + out_idx=[-1], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def flashattn(batch, heads, heads_kv, dim, dim_v, block_N, block_H, page_block_size, num_stages, threads, num_pages): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // heads_kv - @tilelang.jit( - out_idx=[-1], pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) - def kernel_func(block_N, block_H, page_block_size, num_split, num_stages, threads, num_pages, - max_num_blocks_per_seq, max_selected_blocks): - shape_q = [batch, heads, dim] - shape_k = [num_pages, page_block_size, heads_kv, dim] - shape_v = [num_pages, page_block_size, heads_kv, dim_v] - shape_indices = [batch, heads_kv, max_selected_blocks] - shape_block_table = [batch, max_num_blocks_per_seq] - shape_o = [batch, heads, dim_v] - part_shape = [batch, heads, num_split, dim_v] - valid_block_H = min(block_H, kv_group_num) - assert block_N <= page_block_size and page_block_size % block_N == 0 - block_ratio = page_block_size // block_N - - @T.macro - def flash_attn_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_indices: T.Tensor(shape_indices, "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - block_table: T.Tensor(shape_block_table, "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - ): - with T.Kernel( - batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): - Q_shared = T.alloc_shared([block_H, dim], dtype) - K_shared = T.alloc_shared([block_N, dim], dtype) - V_shared = T.alloc_shared([block_N, dim_v], dtype) - acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) - acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) - acc_o = T.alloc_fragment([block_H, dim_v], accum_dtype) - - scores_max = T.alloc_fragment([block_H], accum_dtype) - scores_max_prev = T.alloc_fragment([block_H], accum_dtype) - scores_scale = T.alloc_fragment([block_H], accum_dtype) - scores_sum = T.alloc_fragment([block_H], accum_dtype) 
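# --- editor sketch (plain Python, hypothetical names, not part of the patch): the host-side
# index arithmetic that the split kernel below performs per selected KV block. Work along the
# `num_split` grid axis is partitioned as evenly as possible, and a logical block of size
# block_N is mapped to a physical page of size page_block_size via block_table.
def split_bounds(num_blocks, num_split, sid):
    # mirrors: blocks_per_split = num_blocks // num_split, remainder spread over leading splits
    blocks_per_split, remaining = divmod(num_blocks, num_split)
    count = blocks_per_split + (1 if sid < remaining else 0)
    start = blocks_per_split * sid + min(sid, remaining)
    return start, count

def logical_to_physical(logical_block_idx, block_table_row, page_block_size, block_N):
    # mirrors: block_table_idx = logical // block_ratio, block_tile_idx = logical % block_ratio
    block_ratio = page_block_size // block_N
    page = block_table_row[logical_block_idx // block_ratio]
    row_start = (logical_block_idx % block_ratio) * block_N  # offset of the block_N tile in the page
    return page, row_start

# e.g. page_block_size=256, block_N=64: logical block 5 lives in block_table_row[1], rows 64..127
page, row_start = logical_to_physical(5, [7, 3, 9], 256, 64)
assert (page, row_start) == (3, 64)
assert sum(split_bounds(10, 4, sid)[1] for sid in range(4)) == 10  # all blocks covered exactly once
# --- end editor sketch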
- logsum = T.alloc_fragment([block_H], accum_dtype) - has_valid_block = T.alloc_var("bool") - - bid = bx - hid = by - sid = bz - cur_kv_head = hid // (kv_group_num // valid_block_H) - - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) - T.fill(acc_o, 0) - T.fill(logsum, 0) - T.fill(scores_max, -T.infinity(accum_dtype)) - - num_blocks = max_selected_blocks - blocks_per_split = T.floordiv(num_blocks, num_split) - remaining_blocks = T.floormod(num_blocks, num_split) - loop_range = (blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0)) - start = blocks_per_split * sid + T.min(sid, remaining_blocks) - has_valid_block = False - for k in T.Pipelined(loop_range, num_stages=num_stages): - logical_block_idx = block_indices[bid, cur_kv_head, start + k] - if logical_block_idx >= 0: - has_valid_block = True - block_table_idx = T.floordiv(logical_block_idx, block_ratio) - block_tile_idx = T.floormod(logical_block_idx, block_ratio) - physical_block_idx = block_table[bid, block_table_idx] - T.copy( - K[physical_block_idx, - block_tile_idx * block_N:(block_tile_idx + 1) * block_N, - cur_kv_head, :], K_shared) - T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) - if k == 0: # assume block_indices is sorted in reverse order, otherwise, remove this if condition - for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else( - logical_block_idx * block_N + j >= cache_seqlens[bid], - -T.infinity(accum_dtype), acc_s[i, j]) - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - for i in T.Parallel(block_H): - scores_max[i] = T.if_then_else(scores_max[i] > scores_max_prev[i], - scores_max[i], scores_max_prev[i]) - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - - scores_max[i] * scale) + num_split = T.dynamic("num_split") + max_num_blocks_per_seq = T.dynamic("max_num_blocks_per_seq") + max_selected_blocks = T.dynamic("max_selected_blocks") + + shape_q = [batch, heads, dim] + shape_k = [num_pages, page_block_size, heads_kv, dim] + shape_v = [num_pages, page_block_size, heads_kv, dim_v] + shape_indices = [batch, heads_kv, max_selected_blocks] + shape_block_table = [batch, max_num_blocks_per_seq] + shape_o = [batch, heads, dim_v] + part_shape = [batch, heads, num_split, dim_v] + valid_block_H = min(block_H, kv_group_num) + assert block_N <= page_block_size and page_block_size % block_N == 0 + block_ratio = page_block_size // block_N + + @T.prim_func + def main( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + block_indices: T.Tensor(shape_indices, T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + block_table: T.Tensor(shape_block_table, T.int32), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), + Output: T.Tensor(shape_o, dtype), + ): + # flash_attn_split + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_H, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim_v], dtype) + acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) + acc_o = T.alloc_fragment([block_H, dim_v], accum_dtype) + + scores_max = T.alloc_fragment([block_H], accum_dtype) + scores_max_prev = T.alloc_fragment([block_H], accum_dtype) + 
scores_scale = T.alloc_fragment([block_H], accum_dtype) + scores_sum = T.alloc_fragment([block_H], accum_dtype) + logsum = T.alloc_fragment([block_H], accum_dtype) + has_valid_block = T.alloc_var(T.bool) + + bid = bx + hid = by + sid = bz + cur_kv_head = hid // (kv_group_num // valid_block_H) + + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + + num_blocks = max_selected_blocks + blocks_per_split = T.floordiv(num_blocks, num_split) + remaining_blocks = T.floormod(num_blocks, num_split) + loop_range = blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0) + start = blocks_per_split * sid + T.min(sid, remaining_blocks) + has_valid_block = False + for k in T.Pipelined(loop_range, num_stages=num_stages): + logical_block_idx = block_indices[bid, cur_kv_head, start + k] + if logical_block_idx >= 0: + has_valid_block = True + block_table_idx = T.floordiv(logical_block_idx, block_ratio) + block_tile_idx = T.floormod(logical_block_idx, block_ratio) + physical_block_idx = block_table[bid, block_table_idx] + T.copy(K[physical_block_idx, block_tile_idx * block_N : (block_tile_idx + 1) * block_N, cur_kv_head, :], K_shared) + T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + if k == 0: # assume block_indices is sorted in reverse order, otherwise, remove this if condition for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_H): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - for i, j in T.Parallel(block_H, dim_v): - acc_o[i, j] *= scores_scale[i] - T.copy( - V[physical_block_idx, - block_tile_idx * block_N:(block_tile_idx + 1) * block_N, - cur_kv_head, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - if has_valid_block: - for i, j in T.Parallel(block_H, dim_v): - acc_o[i, j] /= logsum[i] - + acc_s[i, j] = T.if_then_else( + logical_block_idx * block_N + j >= cache_seqlens[bid], -T.infinity(accum_dtype), acc_s[i, j] + ) + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - - for i in T.Parallel(block_H): - if i < valid_block_H: - glse[bid, hid * valid_block_H + i, sid] = logsum[i] - + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_H): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + for i, j in T.Parallel(block_H, dim_v): + acc_o[i, j] *= scores_scale[i] + T.copy(V[physical_block_idx, block_tile_idx * block_N : (block_tile_idx + 1) * block_N, cur_kv_head, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + if has_valid_block: for i, j in T.Parallel(block_H, dim_v): - if i < valid_block_H: - Output_partial[bid, hid * valid_block_H + i, sid, j] = acc_o[i, j] - - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: 
T.Tensor(shape_o, dtype), - ): - with T.Kernel(heads, batch, threads=128) as (by, bz): - po_local = T.alloc_fragment([dim_v], accum_dtype) - o_accum_local = T.alloc_fragment([dim_v], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - max_split = T.alloc_local([1], "int32") - - T.annotate_layout({ - lse_logsum_local: - T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) - - T.clear(lse_logsum_local) - T.clear(o_accum_local) - lse_max_local[0] = -T.infinity(accum_dtype) - for k in T.serial(num_split): - lse_local_split[0] = glse[bz, by, k] - if (lse_local_split[0] != 0): - max_split[0] = k - lse_max_local[0] = T.max(lse_max_local[0], glse[bz, by, k]) - - for k in T.Pipelined(num_split, num_stages=1): - if k <= max_split[0]: - lse_local_split[0] = glse[bz, by, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] - for k in T.serial(num_split): - if k <= max_split[0]: - for i in T.Parallel(dim_v): - po_local[i] = Output_partial[bz, by, k, i] - lse_local_split[0] = glse[bz, by, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) - for i in T.Parallel(dim_v): - o_accum_local[i] += po_local[i] * scale_local[0] - for i in T.Parallel(dim_v): - Output[bz, by, i] = o_accum_local[i] - - @T.prim_func - def main( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_indices: T.Tensor(shape_indices, "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - block_table: T.Tensor(shape_block_table, "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), - ): - flash_attn_split(Q, K, V, block_indices, cache_seqlens, block_table, glse, - Output_partial) - combine(glse, Output_partial, Output) - - return main - - return kernel_func + acc_o[i, j] /= logsum[i] + for i in T.Parallel(block_H): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + + # TODO(lei): Support T.Parallel(valid_block_H) + for i in T.Parallel(block_H): + if i < valid_block_H: + glse[bid, hid * valid_block_H + i, sid] = logsum[i] + for i, j in T.Parallel(block_H, dim_v): + if i < valid_block_H: + Output_partial[bid, hid * valid_block_H + i, sid, j] = acc_o[i, j] + + # combine + with T.Kernel(heads, batch, threads=128) as (by, bz): + po_local = T.alloc_fragment([dim_v], accum_dtype) + o_accum_local = T.alloc_fragment([dim_v], accum_dtype) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + max_split = T.alloc_var(T.int32) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + lse_max_local = -T.infinity(accum_dtype) + for k in T.serial(num_split): + lse_local_split = glse[bz, by, k] + if lse_local_split != 0: + max_split = k + lse_max_local = T.max(lse_max_local, glse[bz, by, k]) + + for k in T.Pipelined(num_split, num_stages=1): + if k <= max_split: + lse_local_split = glse[bz, by, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local + for k in T.serial(num_split): + if k <= max_split: + for i in T.Parallel(dim_v): + po_local[i] = Output_partial[bz, by, k, i] + lse_local_split = glse[bz, by, k] + 
scale_local = T.exp2(lse_local_split - lse_logsum_local) + for i in T.Parallel(dim_v): + o_accum_local[i] += po_local[i] * scale_local + for i in T.Parallel(dim_v): + Output[bz, by, i] = o_accum_local[i] + + print(main) + return main class SparseFlashAttn(torch.nn.Module): - def __init__(self, batch, heads, heads_kv, dim, dim_v, page_block_size, block_N, num_pages): super(SparseFlashAttn, self).__init__() self.batch = batch @@ -216,19 +179,6 @@ def __init__(self, batch, heads, heads_kv, dim, dim_v, page_block_size, block_N, self.page_block_size = page_block_size self.num_pages = num_pages self.block_H = 64 - - self.kernel = flashattn(batch, heads, heads_kv, dim, dim_v)( - block_N=block_N, - block_H=self.block_H, - page_block_size=page_block_size, - num_split=T.dynamic("num_split"), - num_stages=2, - threads=128, - num_pages=num_pages, - max_num_blocks_per_seq=T.dynamic("max_num_blocks_per_seq"), - max_selected_blocks=T.dynamic("max_selected_blocks"), - ) - props = torch.cuda.get_device_properties(torch.device("cuda:0")) self.num_sm = props.multi_processor_count @@ -250,40 +200,35 @@ def forward(self, query, key, value, block_indices, cache_seqlens, block_table): num_sm = self.num_sm num_split = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) - - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device='cuda') - output_partial = torch.empty((batch, heads, num_split, dim_v), - dtype=torch.float32, - device='cuda') - - output = self.kernel( - query, - key, - value, - block_indices, - cache_seqlens, - block_table, - glse, - output_partial, + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 ) + + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") + + output = flashattn( + batch, + heads, + heads_kv, + dim, + dim_v, + block_N=block_size, + block_H=self.block_H, + page_block_size=self.page_block_size, + num_stages=2, + threads=128, + num_pages=self.num_pages, + )(query, key, value, block_indices, cache_seqlens, block_table, glse, output_partial) return output -def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_seqlens, - block_table, page_block_size, block_size): +def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_seqlens, block_table, page_block_size, block_size): """ Paged version of sparse attention reference implementation. 
- + Args: query: [batch, heads, dim] - key_cache: [num_pages, page_block_size, heads_kv, dim] + key_cache: [num_pages, page_block_size, heads_kv, dim] value_cache: [num_pages, page_block_size, heads_kv, dim] block_indices: [batch, heads_kv, max_selected_blocks] - logical block indices cache_seqlens: [batch] - actual sequence lengths @@ -299,12 +244,8 @@ def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_ # Reconstruct the full key and value tensors from paged cache max_cache_seqlen = max(cache_seqlens).item() - key_full = torch.zeros((batch, heads_kv, max_cache_seqlen, dim), - dtype=key_cache.dtype, - device=key_cache.device) - value_full = torch.zeros((batch, heads_kv, max_cache_seqlen, dim_v), - dtype=value_cache.dtype, - device=value_cache.device) + key_full = torch.zeros((batch, heads_kv, max_cache_seqlen, dim), dtype=key_cache.dtype, device=key_cache.device) + value_full = torch.zeros((batch, heads_kv, max_cache_seqlen, dim_v), dtype=value_cache.dtype, device=value_cache.device) # Reconstruct full tensors from paged cache using block_table for b in range(batch): @@ -320,20 +261,14 @@ def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_ actual_block_size = end_token - start_token # Copy from paged cache to full tensors - key_full[b, :, start_token:end_token, :] = key_cache[ - physical_block_idx, :actual_block_size, :, :].transpose(0, 1) - value_full[b, :, start_token:end_token, :] = value_cache[ - physical_block_idx, :actual_block_size, :, :].transpose(0, 1) + key_full[b, :, start_token:end_token, :] = key_cache[physical_block_idx, :actual_block_size, :, :].transpose(0, 1) + value_full[b, :, start_token:end_token, :] = value_cache[physical_block_idx, :actual_block_size, :, :].transpose(0, 1) # Reshape query for grouped attention - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] # Compute attention scores - scores = einsum( - query, key_full, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = einsum(query, key_full, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, heads_kv, seqlen_kv] # Create sparse mask based on block_indices sparse_mask = torch.zeros_like(scores) @@ -349,24 +284,23 @@ def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_ sparse_mask[b, :, h, start_pos:end_pos] = 1 # Apply sparse mask - scores = scores.masked_fill(sparse_mask == 0, float('-inf')) + scores = scores.masked_fill(sparse_mask == 0, float("-inf")) # Apply causal mask based on actual sequence lengths range_len = torch.arange(scores.shape[-1], device=scores.device).unsqueeze(0) cache_seqlens_expanded = cache_seqlens.unsqueeze(1) pad_mask = range_len >= cache_seqlens_expanded pad_mask = pad_mask[:, None, None, :] - scores = scores.masked_fill(pad_mask, float('-inf')) + scores = scores.masked_fill(pad_mask, float("-inf")) # Compute attention weights attention = F.softmax(scores / scale, dim=-1) # Apply attention to values - out = einsum(attention, value_full, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, heads_kv, dim] + out = einsum(attention, value_full, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, heads_kv, dim] # Reshape output back to original format - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = 
rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out @@ -374,17 +308,23 @@ def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_ def ref_program_fa(query, kcache, vcache, cache_seqlens, block_table): # latency reference # from flash_attn_interface import flash_attn_with_kvcache # fa3 - from flash_attn import flash_attn_with_kvcache #fa2 + from flash_attn import flash_attn_with_kvcache # fa2 + query = query.unsqueeze(1) - output = flash_attn_with_kvcache( - query, kcache, vcache, cache_seqlens=cache_seqlens, block_table=block_table) + output = flash_attn_with_kvcache(query, kcache, vcache, cache_seqlens=cache_seqlens, block_table=block_table) output = output.squeeze(1) return output def main(args): - - batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v + batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = ( + args.batch, + args.heads, + args.heads_kv, + args.max_cache_seqlen, + args.dim, + args.dim_v, + ) sparse_ratio = args.sparse_ratio block_N = args.block_N page_block_size = args.page_block_size @@ -396,35 +336,30 @@ def main(args): dtype = torch.float16 # Generate random inputs - Q = torch.randn((batch, heads, dim), dtype=dtype, device='cuda') - cache_seqlens = torch.randint( - max_cache_seqlen // 2, max_cache_seqlen + 1, (batch,), dtype=torch.int32, device='cuda') + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(max_cache_seqlen // 2, max_cache_seqlen + 1, (batch,), dtype=torch.int32, device="cuda") print("cache_seqlens: ", cache_seqlens) - K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device='cuda') - V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device='cuda') + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") # Create paged KV cache - K_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim), dtype=dtype, device='cuda') - V_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim_v), - dtype=dtype, - device='cuda') + K_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim), dtype=dtype, device="cuda") + V_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim_v), dtype=dtype, device="cuda") # Create block table and block indices for dense case (all blocks selected) max_num_blocks_per_seq = int(math.ceil(max_cache_seqlen / page_block_size)) print("max_num_blocks_per_seq: ", max_num_blocks_per_seq) - block_table = torch.zeros((batch, max_num_blocks_per_seq), dtype=torch.int32, device='cuda') - block_indices = torch.zeros((batch, heads_kv, max_selected_blocks), - dtype=torch.int32, - device='cuda') + block_table = torch.zeros((batch, max_num_blocks_per_seq), dtype=torch.int32, device="cuda") + block_indices = torch.zeros((batch, heads_kv, max_selected_blocks), dtype=torch.int32, device="cuda") # Fill block table and block indices and cache # Create a pool of available physical blocks - total_blocks_needed = sum( - int(math.ceil(cache_seqlens[seq_idx].item() / page_block_size)) for seq_idx in range(batch)) + total_blocks_needed = sum(int(math.ceil(cache_seqlens[seq_idx].item() / page_block_size)) for seq_idx in range(batch)) available_blocks = list(range(total_blocks_needed)) import random + random.seed(42) # For reproducibility random.shuffle(available_blocks) @@ -459,10 +394,8 @@ def 
main(args): actual_block_size = end_token - start_token # Copy K and V data to the paged cache - K_cache[physical_block_idx, :actual_block_size, :, :] = K[seq_idx, - start_token:end_token, :, :] - V_cache[physical_block_idx, :actual_block_size, :, :] = V[seq_idx, - start_token:end_token, :, :] + K_cache[physical_block_idx, :actual_block_size, :, :] = K[seq_idx, start_token:end_token, :, :] + V_cache[physical_block_idx, :actual_block_size, :, :] = V[seq_idx, start_token:end_token, :, :] # Fill block_indices for sparse attention # For dense case (verification), we select all blocks in reverse order @@ -497,10 +430,9 @@ def main(args): remaining_blocks = [b for b in all_blocks if b not in selected_blocks] if remaining_blocks: import random + random.seed(42) # For reproducibility - additional_blocks = random.sample( - remaining_blocks, - min(num_selected - recent_blocks, len(remaining_blocks))) + additional_blocks = random.sample(remaining_blocks, min(num_selected - recent_blocks, len(remaining_blocks))) selected_blocks.extend(additional_blocks) # Sort selected blocks in reverse order (most recent first) @@ -513,25 +445,20 @@ def main(args): block_indices[seq_idx, head_idx, i] = -1 # Initialize sparse attention module - sparse_attn = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, page_block_size, block_N, - num_blocks) - output_sparse = sparse_attn.forward(Q, K_cache, V_cache, block_indices, cache_seqlens, - block_table) + sparse_attn = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, page_block_size, block_N, num_blocks) + output_sparse = sparse_attn.forward(Q, K_cache, V_cache, block_indices, cache_seqlens, block_table) import flash_attn # noqa: F401 - output_ref_torch = ref_program_torch_paged(Q, K_cache, V_cache, block_indices, cache_seqlens, - block_table, page_block_size, block_N) + output_ref_torch = ref_program_torch_paged(Q, K_cache, V_cache, block_indices, cache_seqlens, block_table, page_block_size, block_N) output_ref_fa = ref_program_fa(Q, K_cache, V_cache, cache_seqlens, block_table) # Check correctness if sparse_ratio == 0.0: max_diff = torch.max(torch.abs(output_sparse - output_ref_fa)).item() mean_diff = torch.mean(torch.abs(output_sparse - output_ref_fa)).item() - assert torch.allclose( - output_ref_fa, output_ref_torch, atol=1e-2), "Reference outputs do not match!" + assert torch.allclose(output_ref_fa, output_ref_torch, atol=1e-2), "Reference outputs do not match!" 
else: - max_diff = torch.max(torch.abs(output_sparse - output_ref_torch)).item() mean_diff = torch.mean(torch.abs(output_sparse - output_ref_torch)).item() @@ -573,18 +500,144 @@ def main(args): print(f"Speedup: {kernel_time_fa / kernel_time:.2f}x") +def run_regression_perf(args): + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = ( + args.batch, + args.heads, + args.heads_kv, + args.max_cache_seqlen, + args.dim, + args.dim_v, + ) + sparse_ratio = args.sparse_ratio + block_N = args.block_N + page_block_size = args.page_block_size + num_pages = args.num_pages + max_selected_blocks = int(math.ceil(max_cache_seqlen / block_N)) + dtype = torch.float16 + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(max_cache_seqlen // 2, max_cache_seqlen + 1, (batch,), dtype=torch.int32, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + K_cache = torch.zeros((num_pages, page_block_size, heads_kv, dim), dtype=dtype, device="cuda") + V_cache = torch.zeros((num_pages, page_block_size, heads_kv, dim_v), dtype=dtype, device="cuda") + max_num_blocks_per_seq = int(math.ceil(max_cache_seqlen / page_block_size)) + block_table = torch.zeros((batch, max_num_blocks_per_seq), dtype=torch.int32, device="cuda") + block_indices = torch.zeros((batch, heads_kv, max_selected_blocks), dtype=torch.int32, device="cuda") + total_blocks_needed = sum(int(math.ceil(cache_seqlens[seq_idx].item() / page_block_size)) for seq_idx in range(batch)) + available_blocks = list(range(total_blocks_needed)) + import random + + random.seed(42) + random.shuffle(available_blocks) + block_assignment = {} + block_idx_counter = 0 + for seq_idx in range(batch): + seq_len = cache_seqlens[seq_idx].item() + num_blocks_needed = int(math.ceil(seq_len / page_block_size)) + for block_idx in range(num_blocks_needed): + physical_block_idx = available_blocks[block_idx_counter] + block_table[seq_idx, block_idx] = physical_block_idx + block_assignment[(seq_idx, block_idx)] = physical_block_idx + block_idx_counter += 1 + for seq_idx in range(batch): + seq_len = cache_seqlens[seq_idx].item() + num_blocks_needed = int(math.ceil(seq_len / page_block_size)) + for block_idx in range(num_blocks_needed): + physical_block_idx = block_assignment[(seq_idx, block_idx)] + start_token = block_idx * page_block_size + end_token = min(start_token + page_block_size, seq_len) + actual_block_size = end_token - start_token + K_cache[physical_block_idx, :actual_block_size, :, :] = K[seq_idx, start_token:end_token, :, :] + V_cache[physical_block_idx, :actual_block_size, :, :] = V[seq_idx, start_token:end_token, :, :] + for seq_idx in range(batch): + seq_len = cache_seqlens[seq_idx].item() + num_tile = int(math.ceil(seq_len / block_N)) + if sparse_ratio == 0.0: + selected_blocks = min(num_tile, max_selected_blocks) + for head_idx in range(heads_kv): + for i in range(selected_blocks): + block_indices[seq_idx, head_idx, i] = num_tile - 1 - i + for i in range(selected_blocks, max_selected_blocks): + block_indices[seq_idx, head_idx, i] = -1 + else: + num_selected = int(num_tile * (1.0 - sparse_ratio)) + num_selected = max(1, min(num_selected, max_selected_blocks)) + all_blocks = list(range(num_tile)) + for head_idx in range(heads_kv): + selected_blocks = [] + recent_blocks = 1 + selected_blocks.append(num_tile - 1) + if num_selected > 
recent_blocks: + remaining_blocks = [b for b in all_blocks if b not in selected_blocks] + if remaining_blocks: + import random + + random.seed(42) + additional_blocks = random.sample(remaining_blocks, min(num_selected - recent_blocks, len(remaining_blocks))) + selected_blocks.extend(additional_blocks) + + selected_blocks.sort(reverse=True) + + for i in range(len(selected_blocks)): + block_indices[seq_idx, head_idx, i] = selected_blocks[i] + for i in range(len(selected_blocks), max_selected_blocks): + block_indices[seq_idx, head_idx, i] = -1 + + sparse_kernel = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, page_block_size, block_N, num_pages) + batch = sparse_kernel.batch + heads = sparse_kernel.heads + heads_kv = sparse_kernel.heads_kv + dim_v = sparse_kernel.dim_v + dim = sparse_kernel.dim + block_size = sparse_kernel.block_N + max_selected_blocks = block_indices.shape[-1] + + num_m_blocks = 1 * (heads // heads_kv + sparse_kernel.block_H - 1) // sparse_kernel.block_H + num_n_blocks = max_selected_blocks + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 + total_mblocks = batch * heads_kv * num_m_blocks + num_sm = sparse_kernel.num_sm + + num_split = num_splits_heuristic( + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) + + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") + kernel = flashattn( + batch, + heads, + heads_kv, + dim, + dim_v, + block_N=block_size, + block_H=sparse_kernel.block_H, + page_block_size=sparse_kernel.page_block_size, + num_stages=2, + threads=128, + num_pages=sparse_kernel.num_pages, + ) + + def run_kernel_only(): + kernel(Q, K_cache, V_cache, block_indices, cache_seqlens, block_table, glse, output_partial) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--heads_kv', type=int, default=8, help='heads_kv') - parser.add_argument( - '--max_cache_seqlen', type=int, default=8192, help='kvcache sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--dim_v', type=int, default=128, help='dim_v') - parser.add_argument('--sparse_ratio', type=float, default=0.0, help='sparse ratio') - parser.add_argument('--block_N', type=int, default=64, help='block_N') - parser.add_argument('--page_block_size', type=int, default=256, help='block size of pages') - parser.add_argument('--num_pages', type=int, default=1024, help='total number of pages') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--heads_kv", type=int, default=8, help="heads_kv") + parser.add_argument("--max_cache_seqlen", type=int, default=8192, help="kvcache sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--dim_v", type=int, default=128, help="dim_v") + parser.add_argument("--sparse_ratio", type=float, default=0.0, help="sparse ratio") + parser.add_argument("--block_N", type=int, default=64, help="block_N") + parser.add_argument("--page_block_size", type=int, default=256, help="block size of pages") + parser.add_argument("--num_pages", 
type=int, default=1024, help="total number of pages") args = parser.parse_args() main(args) diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py index ae3004267..54148e69b 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py @@ -7,191 +7,156 @@ import time import math from heuristic import num_splits_heuristic - - -def flashattn(batch, heads, heads_kv, dim, dim_v): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" +from tilelang.profiler import do_bench + + +@tilelang.jit( + out_idx=[-1], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def flashattn(batch, heads, heads_kv, dim, dim_v, block_N, block_H, num_stages, threads): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // heads_kv - @tilelang.jit( - out_idx=[-1], pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) - def kernel_func(block_N, block_H, num_split, num_stages, threads, max_cache_seqlen, - max_selected_blocks): - shape_q = [batch, heads, dim] - shape_k = [batch, max_cache_seqlen, heads_kv, dim] - shape_v = [batch, max_cache_seqlen, heads_kv, dim_v] - shape_indices = [batch, heads_kv, max_selected_blocks] - shape_o = [batch, heads, dim_v] - part_shape = [batch, heads, num_split, dim_v] - valid_block_H = min(block_H, kv_group_num) - - @T.macro - def flash_attn_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_indices: T.Tensor(shape_indices, "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - # actual_num_blocks: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - ): - with T.Kernel( - batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): - Q_shared = T.alloc_shared([block_H, dim], dtype) - K_shared = T.alloc_shared([block_N, dim], dtype) - V_shared = T.alloc_shared([block_N, dim_v], dtype) - # O_shared = T.alloc_shared([valid_block_H, dim_v], dtype) - acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) - acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) - acc_o = T.alloc_fragment([block_H, dim_v], accum_dtype) - - scores_max = T.alloc_fragment([block_H], accum_dtype) - scores_max_prev = T.alloc_fragment([block_H], accum_dtype) - scores_scale = T.alloc_fragment([block_H], accum_dtype) - scores_sum = T.alloc_fragment([block_H], accum_dtype) - logsum = T.alloc_fragment([block_H], accum_dtype) - has_valid_block = T.alloc_var("bool") - - bid = bx - hid = by - sid = bz - cur_kv_head = hid // (kv_group_num // valid_block_H) - - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) - T.fill(acc_o, 0) - T.fill(logsum, 0) - T.fill(scores_max, -T.infinity(accum_dtype)) - - num_blocks = max_selected_blocks - blocks_per_split = T.floordiv(num_blocks, num_split) - remaining_blocks = T.floormod(num_blocks, num_split) - loop_range = (blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0)) - start = blocks_per_split * sid + T.min(sid, remaining_blocks) - has_valid_block = False - - for k in T.Pipelined(loop_range, num_stages=num_stages): - i_s = block_indices[bid, cur_kv_head, start + k] - if i_s >= 0: - 
has_valid_block = True - T.copy(K[bid, i_s * block_N:(i_s + 1) * block_N, cur_kv_head, :], K_shared) - T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) - if k == 0: # assume block_indices is sorted in reverse order, otherwise, remove this if condition - for i, j in T.Parallel(block_H, block_N): - acc_s[i, - j] = T.if_then_else(i_s * block_N + j >= cache_seqlens[bid], - -T.infinity(accum_dtype), acc_s[i, j]) - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - for i in T.Parallel(block_H): - scores_max[i] = T.if_then_else(scores_max[i] > scores_max_prev[i], - scores_max[i], scores_max_prev[i]) - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - - scores_max[i] * scale) + num_split = T.dynamic("num_split") + max_cache_seqlen = T.dynamic("max_cache_seqlen") + max_selected_blocks = T.dynamic("max_selected_blocks") + + shape_q = [batch, heads, dim] + shape_k = [batch, max_cache_seqlen, heads_kv, dim] + shape_v = [batch, max_cache_seqlen, heads_kv, dim_v] + shape_indices = [batch, heads_kv, max_selected_blocks] + shape_o = [batch, heads, dim_v] + part_shape = [batch, heads, num_split, dim_v] + valid_block_H = min(block_H, kv_group_num) + + @T.prim_func + def main( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + block_indices: T.Tensor(shape_indices, T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + # actual_num_blocks: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), + Output: T.Tensor(shape_o, dtype), + ): + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_H, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim_v], dtype) + acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) + acc_o = T.alloc_fragment([block_H, dim_v], accum_dtype) + + scores_max = T.alloc_fragment([block_H], accum_dtype) + scores_max_prev = T.alloc_fragment([block_H], accum_dtype) + scores_scale = T.alloc_fragment([block_H], accum_dtype) + scores_sum = T.alloc_fragment([block_H], accum_dtype) + logsum = T.alloc_fragment([block_H], accum_dtype) + has_valid_block = T.alloc_var(T.bool) + + bid = bx + hid = by + sid = bz + cur_kv_head = hid // (kv_group_num // valid_block_H) + + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + + num_blocks = max_selected_blocks + blocks_per_split = T.floordiv(num_blocks, num_split) + remaining_blocks = T.floormod(num_blocks, num_split) + loop_range = blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0) + start = blocks_per_split * sid + T.min(sid, remaining_blocks) + has_valid_block = False + + for k in T.Pipelined(loop_range, num_stages=num_stages): + i_s = block_indices[bid, cur_kv_head, start + k] + if i_s >= 0: + has_valid_block = True + T.copy(K[bid, i_s * block_N : (i_s + 1) * block_N, cur_kv_head, :], K_shared) + T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + if k == 0: # assume block_indices is sorted in reverse order, otherwise, remove this if condition for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = 
T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_H): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - for i, j in T.Parallel(block_H, dim_v): - acc_o[i, j] *= scores_scale[i] - T.copy(V[bid, i_s * block_N:(i_s + 1) * block_N, cur_kv_head, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - if has_valid_block: - for i, j in T.Parallel(block_H, dim_v): - acc_o[i, j] /= logsum[i] - + acc_s[i, j] = T.if_then_else(i_s * block_N + j >= cache_seqlens[bid], -T.infinity(accum_dtype), acc_s[i, j]) + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - - for i in T.Parallel(block_H): - if i < valid_block_H: - glse[bid, hid * valid_block_H + i, sid] = logsum[i] - + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_H): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + for i, j in T.Parallel(block_H, dim_v): + acc_o[i, j] *= scores_scale[i] + T.copy(V[bid, i_s * block_N : (i_s + 1) * block_N, cur_kv_head, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + if has_valid_block: for i, j in T.Parallel(block_H, dim_v): - if i < valid_block_H: - Output_partial[bid, hid * valid_block_H + i, sid, j] = acc_o[i, j] - - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), - ): - with T.Kernel(heads, batch, threads=128) as (by, bz): - po_local = T.alloc_fragment([dim_v], accum_dtype) - o_accum_local = T.alloc_fragment([dim_v], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - max_split = T.alloc_local([1], "int32") - - T.annotate_layout({ - lse_logsum_local: - T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) - - T.clear(lse_logsum_local) - T.clear(o_accum_local) - lse_max_local[0] = -T.infinity(accum_dtype) - for k in T.serial(num_split): - lse_local_split[0] = glse[bz, by, k] - if (lse_local_split[0] != 0): - max_split[0] = k - lse_max_local[0] = T.max(lse_max_local[0], glse[bz, by, k]) - - for k in T.Pipelined(num_split, num_stages=1): - if k <= max_split[0]: - lse_local_split[0] = glse[bz, by, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] - for k in T.serial(num_split): - if k <= max_split[0]: - for i in T.Parallel(dim_v): - po_local[i] = Output_partial[bz, by, k, i] - lse_local_split[0] = glse[bz, by, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) - for i in T.Parallel(dim_v): - o_accum_local[i] += po_local[i] * scale_local[0] - for i in T.Parallel(dim_v): - Output[bz, by, i] = o_accum_local[i] - - @T.prim_func - def main( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_indices: 
T.Tensor(shape_indices, "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - # actual_num_blocks: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), - ): - # flash_attn_split(Q, K, V, block_indices, cache_seqlens, actual_num_blocks, glse, Output_partial) - flash_attn_split(Q, K, V, block_indices, cache_seqlens, glse, Output_partial) - combine(glse, Output_partial, Output) - - return main - - return kernel_func + acc_o[i, j] /= logsum[i] + for i in T.Parallel(block_H): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + + # TODO(lei): Support T.Parallel(valid_block_H) + for i in T.Parallel(block_H): + if i < valid_block_H: + glse[bid, hid * valid_block_H + i, sid] = logsum[i] + for i, j in T.Parallel(block_H, dim_v): + if i < valid_block_H: + Output_partial[bid, hid * valid_block_H + i, sid, j] = acc_o[i, j] + + # combine + with T.Kernel(heads, batch, threads=128) as (by, bz): + po_local = T.alloc_fragment([dim_v], accum_dtype) + o_accum_local = T.alloc_fragment([dim_v], accum_dtype) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + max_split = T.alloc_var(T.int32) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + lse_max_local = -T.infinity(accum_dtype) + for k in T.serial(num_split): + lse_local_split = glse[bz, by, k] + if lse_local_split != 0: + max_split = k + lse_max_local = T.max(lse_max_local, glse[bz, by, k]) + + for k in T.Pipelined(num_split, num_stages=1): + if k <= max_split: + lse_local_split = glse[bz, by, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local + for k in T.serial(num_split): + if k <= max_split: + for i in T.Parallel(dim_v): + po_local[i] = Output_partial[bz, by, k, i] + lse_local_split = glse[bz, by, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) + for i in T.Parallel(dim_v): + o_accum_local[i] += po_local[i] * scale_local + for i in T.Parallel(dim_v): + Output[bz, by, i] = o_accum_local[i] + + return main class SparseFlashAttn(torch.nn.Module): - def __init__(self, batch, heads, heads_kv, dim, dim_v, block_size): super(SparseFlashAttn, self).__init__() self.batch = batch @@ -200,18 +165,7 @@ def __init__(self, batch, heads, heads_kv, dim, dim_v, block_size): self.dim = dim self.dim_v = dim_v self.block_size = block_size - self.block_H = 64 - - self.kernel = flashattn(batch, heads, heads_kv, dim, dim_v)( - block_N=block_size, - block_H=self.block_H, - num_split=T.dynamic("num_split"), - num_stages=2, - threads=128, - max_cache_seqlen=T.dynamic("max_cache_seqlen"), - max_selected_blocks=T.dynamic("max_selected_blocks")) - props = torch.cuda.get_device_properties(torch.device("cuda:0")) self.num_sm = props.multi_processor_count @@ -233,25 +187,27 @@ def forward(self, query, key, value, block_indices, cache_seqlens): num_sm = self.num_sm num_split = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) - - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device='cuda') - output_partial = torch.empty((batch, heads, num_split, dim_v), - dtype=torch.float32, - device='cuda') - - output = self.kernel(query, key, value, block_indices, cache_seqlens, glse, output_partial) + total_mblocks, num_sm, num_n_blocks, 
num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) + + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") + + output = flashattn( + batch, + heads, + heads_kv, + dim, + dim_v, + block_N=block_size, + block_H=self.block_H, + num_stages=2, + threads=128, + )(query, key, value, block_indices, cache_seqlens, glse, output_partial) return output -def sparse_gqa_decode_varlen_indice(query, key, value, block_indices, cache_seqlens, - max_cache_seqlen, block_size): +def sparse_gqa_decode_varlen_indice(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, block_size): """ Args: query: [batch, heads, dim] @@ -273,61 +229,51 @@ def sparse_gqa_decode_varlen_indice(query, key, value, block_indices, cache_seql block_H = 64 actual_num_blocks = torch.sum(block_indices != -1, dim=-1).to(torch.int32) - actual_num_blocks = actual_num_blocks[:, - 0] #[batch], number of valid blocks, assume all groups in the same batch have the same number of blocks + actual_num_blocks = actual_num_blocks[ + :, 0 + ] # [batch], number of valid blocks, assume all groups in the same batch have the same number of blocks # get num_split num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H - num_n_blocks = max_selected_blocks #(kv_seqlen + block_size - 1 ) // block_size + num_n_blocks = max_selected_blocks # (kv_seqlen + block_size - 1 ) // block_size # num_n_blocks = torch.sum(actual_num_blocks, dim=-1).item() * heads_kv # total number of blocks - size_one_kv_head = max_selected_blocks * block_size * ( - dim + dim_v) * 2 #kv_seqlen * (dim + dim_v) * 2 + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks num_sm = 132 num_split = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) - - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device='cuda') - Output_partial = torch.empty((batch, heads, num_split, dim_v), - dtype=torch.float32, - device='cuda') - kernel = flashattn(batch, heads, heads_kv, dim, dim_v)( + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) + + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + Output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") + kernel = flashattn( + batch, + heads, + heads_kv, + dim, + dim_v, block_N=block_size, block_H=block_H, - num_split=T.dynamic("num_split"), num_stages=2, threads=128, - max_cache_seqlen=T.dynamic("max_cache_seqlen"), - max_selected_blocks=T.dynamic("max_selected_blocks")) + ) output = kernel(query, key, value, block_indices, cache_seqlens, glse, Output_partial) return output -def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): - +def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, block_size): batch, heads, dim = query.shape heads_kv = key.shape[2] num_head_groups = query.shape[1] // key.shape[2] scale = dim**0.5 - key = rearrange(key, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] - value = rearrange(value, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] + key = rearrange(key, "b n h d -> b h n d") # 
[batch_size, heads_kv, seqlen_kv, dim] + value = rearrange(value, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, heads_kv, seqlen_kv] sparse_mask = torch.zeros_like(scores) # Assign mask values based on block_indices @@ -336,149 +282,141 @@ def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache valid_indices = block_indices[b, h] # Extract indices for this batch and head for idx in valid_indices: if idx >= 0: - sparse_mask[b, :, h, idx * block_size:(idx + 1) * block_size] = 1 - scores = scores.masked_fill(sparse_mask == 0, float('-inf')) + sparse_mask[b, :, h, idx * block_size : (idx + 1) * block_size] = 1 + scores = scores.masked_fill(sparse_mask == 0, float("-inf")) - range_len = torch.arange(scores.shape[-1], device='cuda').unsqueeze(0) + range_len = torch.arange(scores.shape[-1], device="cuda").unsqueeze(0) cache_seqlens_expanded = cache_seqlens.unsqueeze(1) pad_mask = range_len >= cache_seqlens_expanded pad_mask = pad_mask[:, None, None, :] - scores = scores.masked_fill(pad_mask, float('-inf')) - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = scores.masked_fill(pad_mask, float("-inf")) + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] - out = einsum(attention, value, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, heads_kv, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, value, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, heads_kv, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out -def ref_program_fa(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): +def ref_program_fa(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, block_size): # latency reference # from flash_attn_interface import flash_attn_with_kvcache # fa3 - from flash_attn import flash_attn_with_kvcache #fa2 + from flash_attn import flash_attn_with_kvcache # fa2 + query = query.unsqueeze(1) output = flash_attn_with_kvcache(query, key, value, cache_seqlens=cache_seqlens) output = output.squeeze(1) return output -def debug(name, expect, actual, atol=1e-3, rtol=1e-3): +def assert_close(name, expect, actual, atol=1e-3, rtol=1e-3): all_close = torch.allclose(expect, actual, atol=atol, rtol=rtol) print(name + " all_close={}".format(all_close)) if not all_close: diff = (expect - actual).abs() - print("all_close={}, max={}, min={}, mean={}".format(all_close, - diff.max().item(), - diff.min().item(), - diff.mean().item())) + print("all_close={}, max={}, min={}, mean={}".format(all_close, diff.max().item(), diff.min().item(), diff.mean().item())) max_indices = torch.nonzero(diff == diff.max().item()) first_index = tuple(max_indices[0].tolist()) print(f"Index: {first_index}, expect: {expect[first_index]}, actual: {actual[first_index]}") -def main(batch=8, - heads=32, - heads_kv=8, - max_cache_seqlen=8192, - dim=128, - dim_v=128, - 
sparse_ratio=0.8, - block_size=32): +def main(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v + dtype = torch.float16 sparse_ratio = sparse_ratio block_size = block_size max_selected_blocks = int(math.ceil(max_cache_seqlen * (1 - sparse_ratio) / block_size)) - print("max_selected_blocks: ", max_selected_blocks) - dtype = torch.float16 - - Q = torch.randn((batch, heads, dim), dtype=dtype, device='cuda') - K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device='cuda') - V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device='cuda') - cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device='cuda') - # cache_seqlens = torch.full((batch,), max_cache_seqlen, dtype=torch.int32, device='cuda') - # # Ensure at least one element equals cache_seqlen - # random_index = torch.randint(0, batch, (1,), device='cuda').item() # Select a random index - # # cache_seqlens[random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence - - print("cache_seqlens: ", cache_seqlens) + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() - print("max_valid_num_blocks: ", max_valid_num_blocks) - # Initialize block_indices with -1 (for padding blocks) - block_indices = torch.full((batch, heads_kv, max_selected_blocks), - -1, - dtype=torch.int32, - device='cuda') - # max_num_blocks = int((max_cache_seqlen + block_size - 1)/ block_size) - # block_indices = torch.full((batch, heads_kv, max_num_blocks), -1, dtype=torch.int32, device='cuda') + block_indices = torch.full((batch, heads_kv, max_selected_blocks), -1, dtype=torch.int32, device="cuda") # Assign valid indices while ensuring no duplicates within each batch-group for b in range(batch): max_valid_block = max_valid_num_blocks[b].item() # Max valid blocks for this batch if max_valid_block > 0: # Ensure there's at least one valid block for h in range(heads_kv): - valid_indices = torch.randperm( - max_valid_block, device='cuda', dtype=torch.int32)[:max_selected_blocks] - # valid_indices = torch.randperm(max_valid_block, device='cuda', dtype=torch.int32)[:max_num_blocks] - block_indices[b, h, :len(valid_indices)] = valid_indices + valid_indices = torch.randperm(max_valid_block, device="cuda", dtype=torch.int32)[:max_selected_blocks] + block_indices[b, h, : len(valid_indices)] = valid_indices - # Sort indices within each batch-group for consistency block_indices, _ = block_indices.sort(dim=-1, descending=True) - # print("block_indices: ", block_indices) - actual_num_blocks = torch.sum(block_indices != -1, dim=-1).to(torch.int32)[:, 0] - print("actual_num_blocks: ", actual_num_blocks) - # print(block_indices.shape, actual_num_blocks.shape) - max_num_blocks = torch.max(max_valid_num_blocks).item() - print("max_num_blocks: ", max_num_blocks) # parity reference - ref = ref_program_torch(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, - block_size) + ref = ref_program_torch(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, 
max_num_blocks, block_size) sparse_kernel = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, block_size) out = sparse_kernel(Q, K, V, block_indices, cache_seqlens) - debug("output", ref, out, atol=1e-3, rtol=1e-3) - - import flash_attn # noqa: F401 + assert_close("output", ref, out, atol=1e-3, rtol=1e-3) ## latency reference for _ in range(10): - ref = ref_program_fa(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, - max_num_blocks, block_size) + ref = ref_program_fa(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, block_size) torch.cuda.synchronize() start = time.time() for _ in range(100): - ref = ref_program_fa(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, - max_num_blocks, block_size) + ref = ref_program_fa(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, block_size) torch.cuda.synchronize() print("dense time: ", (time.time() - start) / 100 * 1000) for _ in range(10): - # out = sparse_gqa_decode_varlen_indice(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, block_size) out = sparse_kernel(Q, K, V, block_indices, cache_seqlens) torch.cuda.synchronize() start = time.time() for _ in range(100): - # out = sparse_gqa_decode_varlen_indice(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, block_size) out = sparse_kernel(Q, K, V, block_indices, cache_seqlens) torch.cuda.synchronize() print("sparse time: ", (time.time() - start) / 100 * 1000) +def run_regression_perf(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v + sparse_ratio = sparse_ratio + block_size = block_size + max_selected_blocks = int(math.ceil(max_cache_seqlen * (1 - sparse_ratio) / block_size)) + dtype = torch.float16 + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") + max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() + block_indices = torch.full((batch, heads_kv, max_selected_blocks), -1, dtype=torch.int32, device="cuda") + + for b in range(batch): + max_valid_block = max_valid_num_blocks[b].item() + if max_valid_block > 0: + for h in range(heads_kv): + valid_indices = torch.randperm(max_valid_block, device="cuda", dtype=torch.int32)[:max_selected_blocks] + block_indices[b, h, : len(valid_indices)] = valid_indices + + block_indices, _ = block_indices.sort(dim=-1, descending=True) + sparse_kernel = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, block_size) + batch = sparse_kernel.batch + heads = sparse_kernel.heads + heads_kv = sparse_kernel.heads_kv + dim_v = sparse_kernel.dim_v + dim = sparse_kernel.dim + block_size = sparse_kernel.block_size + + def run_kernel_only(): + sparse_kernel(Q, K, V, block_indices, cache_seqlens) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--heads_kv', type=int, default=8, help='heads_kv') - parser.add_argument( - '--max_cache_seqlen', type=int, 
default=8192, help='kvcache sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--dim_v', type=int, default=128, help='dim_v') - parser.add_argument('--sparse_ratio', type=float, default=0.8, help='sparse ratio') - parser.add_argument('--block_size', type=int, default=32, help='block_size') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--heads_kv", type=int, default=8, help="heads_kv") + parser.add_argument("--max_cache_seqlen", type=int, default=8192, help="kvcache sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--dim_v", type=int, default=128, help="dim_v") + parser.add_argument("--sparse_ratio", type=float, default=0.8, help="sparse ratio") + parser.add_argument("--block_size", type=int, default=32, help="block_size") args = parser.parse_args() - main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, - args.sparse_ratio, args.block_size) + main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, args.sparse_ratio, args.block_size) diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py index ad62817dd..e588ec54c 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py @@ -1,184 +1,156 @@ import torch import torch.nn.functional as F import tilelang -from tilelang.autotuner import * import tilelang.language as T from einops import rearrange, einsum import argparse - import time import math from heuristic import num_splits_heuristic - - -def flashattn(batch, heads, heads_kv, dim, dim_v): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" +from tilelang.profiler import do_bench + + +@tilelang.jit( + out_idx=[-1], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def flashattn(batch, heads, heads_kv, dim, dim_v, block_N, block_H, num_stages, threads): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // heads_kv - @tilelang.jit( - out_idx=[-1], pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) - def kernel_func(block_N, block_H, num_split, num_stages, threads, max_cache_seqlen, num_blocks): - shape_q = [batch, heads, dim] - shape_k = [batch, max_cache_seqlen, heads_kv, dim] - shape_v = [batch, max_cache_seqlen, heads_kv, dim_v] - shape_mask = [batch, heads_kv, num_blocks] - shape_o = [batch, heads, dim_v] - part_shape = [batch, heads, num_split, dim_v] - valid_block_H = min(block_H, kv_group_num) - - @T.macro - def flash_attn_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_mask: T.Tensor(shape_mask, "bool"), - cache_seqlens: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - ): - with T.Kernel( - batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): - Q_shared = T.alloc_shared([block_H, dim], dtype) - K_shared = T.alloc_shared([block_N, dim], dtype) - V_shared = T.alloc_shared([block_N, dim_v], dtype) - # O_shared = 
T.alloc_shared([valid_block_H, dim_v], dtype) - acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) - acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) - acc_o = T.alloc_fragment([block_H, dim_v], accum_dtype) - - scores_max = T.alloc_fragment([block_H], accum_dtype) - scores_max_prev = T.alloc_fragment([block_H], accum_dtype) - scores_scale = T.alloc_fragment([block_H], accum_dtype) - scores_sum = T.alloc_fragment([block_H], accum_dtype) - logsum = T.alloc_fragment([block_H], accum_dtype) - has_valid_block = T.alloc_var("bool") - - bid = bx - hid = by - sid = bz - cur_kv_head = hid // (kv_group_num // valid_block_H) - - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) - T.fill(acc_o, 0) - T.fill(logsum, 0) - T.fill(scores_max, -T.infinity(accum_dtype)) - blocks_per_split = T.floordiv(num_blocks, num_split) - remaining_blocks = T.floormod(num_blocks, num_split) - loop_range = (blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0)) - start = blocks_per_split * sid + T.min(sid, remaining_blocks) - has_valid_block = False - for k in T.Pipelined(loop_range, num_stages=num_stages): - if block_mask[bid, hid, start + k]: - has_valid_block = True - T.copy( - K[bid, (start + k) * block_N:(start + k + 1) * block_N, cur_kv_head, :], - K_shared) - T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) - for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else((start + k) * block_N + j - >= cache_seqlens[bx], - -T.infinity(accum_dtype), acc_s[i, j]) - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - for i in T.Parallel(block_H): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - - scores_max[i] * scale) - for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_H): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - for i, j in T.Parallel(block_H, dim_v): - acc_o[i, j] *= scores_scale[i] - T.copy( - V[bid, (start + k) * block_N:(start + k + 1) * block_N, cur_kv_head, :], - V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - if has_valid_block: - for i, j in T.Parallel(block_H, dim_v): - acc_o[i, j] /= logsum[i] + num_split = T.dynamic("num_split") + max_cache_seqlen = T.dynamic("max_cache_seqlen") + num_blocks = T.dynamic("num_blocks") + + shape_q = [batch, heads, dim] + shape_k = [batch, max_cache_seqlen, heads_kv, dim] + shape_v = [batch, max_cache_seqlen, heads_kv, dim_v] + shape_mask = [batch, heads_kv, num_blocks] + shape_o = [batch, heads, dim_v] + part_shape = [batch, heads, num_split, dim_v] + valid_block_H = min(block_H, kv_group_num) + + @T.prim_func + def main( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + block_mask: T.Tensor(shape_mask, T.bool), + cache_seqlens: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), + Output: T.Tensor(shape_o, dtype), + ): + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_H, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim_v], dtype) + acc_s = T.alloc_fragment([block_H, block_N], 
accum_dtype) + acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) + acc_o = T.alloc_fragment([block_H, dim_v], accum_dtype) + + scores_max = T.alloc_fragment([block_H], accum_dtype) + scores_max_prev = T.alloc_fragment([block_H], accum_dtype) + scores_scale = T.alloc_fragment([block_H], accum_dtype) + scores_sum = T.alloc_fragment([block_H], accum_dtype) + logsum = T.alloc_fragment([block_H], accum_dtype) + has_valid_block = T.alloc_var(T.bool) + + bid = bx + hid = by + sid = bz + cur_kv_head = hid // (kv_group_num // valid_block_H) + + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + blocks_per_split = T.floordiv(num_blocks, num_split) + remaining_blocks = T.floormod(num_blocks, num_split) + loop_range = blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0) + start = blocks_per_split * sid + T.min(sid, remaining_blocks) + has_valid_block = False + for k in T.Pipelined(loop_range, num_stages=num_stages): + if block_mask[bid, hid, start + k]: + has_valid_block = True + T.copy(K[bid, (start + k) * block_N : (start + k + 1) * block_N, cur_kv_head, :], K_shared) + T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.if_then_else((start + k) * block_N + j >= cache_seqlens[bx], -T.infinity(accum_dtype), acc_s[i, j]) + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - - for i in T.Parallel(block_H): - if i < valid_block_H: - glse[bid, hid * valid_block_H + i, sid] = logsum[i] - + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_H): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + for i, j in T.Parallel(block_H, dim_v): + acc_o[i, j] *= scores_scale[i] + T.copy(V[bid, (start + k) * block_N : (start + k + 1) * block_N, cur_kv_head, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + if has_valid_block: for i, j in T.Parallel(block_H, dim_v): - if i < valid_block_H: - Output_partial[bid, hid * valid_block_H + i, sid, j] = acc_o[i, j] - - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), - ): - with T.Kernel(heads, batch, threads=128) as (by, bz): - po_local = T.alloc_fragment([dim_v], accum_dtype) - o_accum_local = T.alloc_fragment([dim_v], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - - T.annotate_layout({ - lse_logsum_local: - T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) - - T.clear(lse_logsum_local) - T.clear(o_accum_local) - lse_max_local[0] = -T.infinity(accum_dtype) - for k in T.serial(num_split): - lse_max_local[0] = T.max(lse_max_local[0], glse[bz, by, k]) - for k in T.Pipelined(num_split, num_stages=1): - lse_local_split[0] = 
glse[bz, by, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] - for k in T.serial(num_split): + acc_o[i, j] /= logsum[i] + for i in T.Parallel(block_H): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + + # TODO(lei): Support T.Parallel(valid_block_H) + for i in T.Parallel(block_H): + if i < valid_block_H: + glse[bid, hid * valid_block_H + i, sid] = logsum[i] + for i, j in T.Parallel(block_H, dim_v): + if i < valid_block_H: + Output_partial[bid, hid * valid_block_H + i, sid, j] = acc_o[i, j] + + # combine + with T.Kernel(heads, batch, threads=128) as (by, bz): + po_local = T.alloc_fragment([dim_v], accum_dtype) + o_accum_local = T.alloc_fragment([dim_v], accum_dtype) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + max_split = T.alloc_var(T.int32) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + lse_max_local = -T.infinity(accum_dtype) + for k in T.serial(num_split): + lse_local_split = glse[bz, by, k] + if lse_local_split != 0: + max_split = k + lse_max_local = T.max(lse_max_local, glse[bz, by, k]) + + for k in T.Pipelined(num_split, num_stages=1): + if k <= max_split: + lse_local_split = glse[bz, by, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local + for k in T.serial(num_split): + if k <= max_split: for i in T.Parallel(dim_v): po_local[i] = Output_partial[bz, by, k, i] - lse_local_split[0] = glse[bz, by, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) + lse_local_split = glse[bz, by, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) for i in T.Parallel(dim_v): - o_accum_local[i] += po_local[i] * scale_local[0] - for i in T.Parallel(dim_v): - Output[bz, by, i] = o_accum_local[i] + o_accum_local[i] += po_local[i] * scale_local + for i in T.Parallel(dim_v): + Output[bz, by, i] = o_accum_local[i] - @T.prim_func - def main( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_mask: T.Tensor(shape_mask, "bool"), - cache_seqlens: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), - ): - flash_attn_split(Q, K, V, block_mask, cache_seqlens, glse, Output_partial) - combine(glse, Output_partial, Output) - - return main - - return kernel_func + return main class SparseFlashAttn(torch.nn.Module): - def __init__(self, batch, heads, heads_kv, dim, dim_v, block_size): super(SparseFlashAttn, self).__init__() self.batch = batch @@ -187,18 +159,7 @@ def __init__(self, batch, heads, heads_kv, dim, dim_v, block_size): self.dim = dim self.dim_v = dim_v self.block_size = block_size - self.block_H = 64 - - self.kernel = flashattn(batch, heads, heads_kv, dim, dim_v)( - block_N=block_size, - block_H=self.block_H, - num_split=T.dynamic("num_split"), - num_stages=2, - threads=128, - max_cache_seqlen=T.dynamic("max_cache_seqlen"), - num_blocks=T.dynamic("num_blocks")) - props = torch.cuda.get_device_properties(torch.device("cuda:0")) self.num_sm = props.multi_processor_count @@ -209,32 +170,33 @@ def forward(self, query, key, value, block_mask, cache_seqlens): dim_v = self.dim_v dim = self.dim block_size = self.block_size - block_H = self.block_H max_cache_seqlen = key.shape[1] # get num_split 
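# A minimal PyTorch reference sketch of the split-K combine performed by the
# `with T.Kernel(heads, batch, threads=128)` block above — an illustration, not code
# from this patch. It assumes `glse` holds the per-split base-2 log-sum-exp values and
# `output_partial` the per-split partial outputs; the device kernel additionally skips
# padding splits whose lse entry is zero, which this sketch ignores. It can be used to
# cross-check `Output` against `glse`/`Output_partial` dumped from the kernel.
import torch

def combine_splits_reference(glse: torch.Tensor, output_partial: torch.Tensor) -> torch.Tensor:
    # glse:           [batch, heads, num_split]
    # output_partial: [batch, heads, num_split, dim_v]
    lse_max = glse.max(dim=-1, keepdim=True).values
    lse_logsum = torch.log2(torch.exp2(glse - lse_max).sum(dim=-1, keepdim=True)) + lse_max
    scale = torch.exp2(glse - lse_logsum)                      # per-split weights; they sum to 1
    return (output_partial * scale.unsqueeze(-1)).sum(dim=2)   # [batch, heads, dim_v]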
max_selected_blocks = (max_cache_seqlen + block_size - 1) // block_size - num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H + num_m_blocks = 1 * (heads // heads_kv + self.block_H - 1) // self.block_H num_n_blocks = max_selected_blocks - size_one_kv_head = max_selected_blocks * block_size * ( - dim + dim_v) * 2 #kv_seqlen * (dim + dim_v) * 2 + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks - # num_sm = 132 num_sm = self.num_sm num_split = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) - # print("num_split: ", num_split) - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device='cuda') - Output_partial = torch.empty((batch, heads, num_split, dim_v), - dtype=torch.float32, - device='cuda') - output = self.kernel(query, key, value, block_mask, cache_seqlens, glse, Output_partial) + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) + + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") + + output = flashattn( + batch, + heads, + heads_kv, + dim, + dim_v, + block_N=block_size, + block_H=self.block_H, + num_stages=2, + threads=128, + )(query, key, value, block_mask, cache_seqlens, glse, output_partial) return output @@ -258,64 +220,52 @@ def sparse_gqa_decode_varlen_mask(query, key, value, block_mask, cache_seqlens, block_H = 64 actual_num_blocks = torch.sum(block_mask, dim=-1).to(torch.int32) - actual_num_blocks = actual_num_blocks[:, - 0] #[batch], number of valid blocks, assume all groups in the same batch have the same number of blocks + actual_num_blocks = actual_num_blocks[ + :, 0 + ] # [batch], number of valid blocks, assume all groups in the same batch have the same number of blocks max_selected_blocks = actual_num_blocks.max().item() # get num_split num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H - num_n_blocks = max_selected_blocks #(kv_seqlen + block_size - 1 ) // block_size + num_n_blocks = max_selected_blocks # (kv_seqlen + block_size - 1 ) // block_size # num_n_blocks = torch.sum(actual_num_blocks, dim=-1).item() * heads_kv # total number of blocks - size_one_kv_head = max_selected_blocks * block_size * ( - dim + dim_v) * 2 #kv_seqlen * (dim + dim_v) * 2 + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks num_sm = 132 num_split = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) - - kernel = flashattn(batch, heads, heads_kv, dim, dim_v)( + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) + + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + Output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") + kernel = flashattn( + batch, + heads, + heads_kv, + dim, + dim_v, block_N=block_size, block_H=block_H, - num_split=T.dynamic("num_split"), num_stages=2, threads=128, - max_cache_seqlen=T.dynamic("max_cache_seqlen"), - num_blocks=T.dynamic("num_blocks")) - glse = torch.empty((batch, heads, num_split), 
dtype=torch.float32, device='cuda') - Output_partial = torch.empty((batch, heads, num_split, dim_v), - dtype=torch.float32, - device='cuda') - # print(kernel.get_kernel_source()) + ) output = kernel(query, key, value, block_mask, cache_seqlens, glse, Output_partial) - return output -def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): - +def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size): batch, heads, dim = query.shape heads_kv = key.shape[2] num_head_groups = query.shape[1] // key.shape[2] scale = dim**0.5 - key = rearrange(key, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] - value = rearrange(value, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] + key = rearrange(key, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] + value = rearrange(value, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, heads_kv, seqlen_kv] sparse_mask = torch.zeros_like(scores) # Assign mask values @@ -323,59 +273,45 @@ def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_se for h in range(heads_kv): for idx in range(num_blocks): if block_mask[b, h, idx]: - sparse_mask[b, :, h, idx * block_size:(idx + 1) * block_size] = 1 + sparse_mask[b, :, h, idx * block_size : (idx + 1) * block_size] = 1 - scores = scores.masked_fill(sparse_mask == 0, float('-inf')) + scores = scores.masked_fill(sparse_mask == 0, float("-inf")) - range_len = torch.arange(scores.shape[-1], device='cuda').unsqueeze(0) + range_len = torch.arange(scores.shape[-1], device="cuda").unsqueeze(0) cache_seqlens_expanded = cache_seqlens.unsqueeze(1) pad_mask = range_len >= cache_seqlens_expanded pad_mask = pad_mask[:, None, None, :] - scores = scores.masked_fill(pad_mask, float('-inf')) - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = scores.masked_fill(pad_mask, float("-inf")) + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] - out = einsum(attention, value, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, heads_kv, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, value, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, heads_kv, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out -def ref_program_fa(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): +def ref_program_fa(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, block_size): # latency reference # from flash_attn_interface import flash_attn_with_kvcache # fa3 - from flash_attn import flash_attn_with_kvcache #fa2 + from flash_attn import flash_attn_with_kvcache # fa2 + query = query.unsqueeze(1) output = flash_attn_with_kvcache(query, key, value, cache_seqlens=cache_seqlens) output = output.squeeze(1) return output -def debug(name, expect, actual, 
atol=1e-3, rtol=1e-3): +def assert_close(name, expect, actual, atol=1e-3, rtol=1e-3): all_close = torch.allclose(expect, actual, atol=atol, rtol=rtol) print(name + " all_close={}".format(all_close)) if not all_close: - # print(expect[3, 28]) - # print(actual[3, 28]) diff = (expect - actual).abs() - print("all_close={}, max={}, min={}, mean={}".format(all_close, - diff.max().item(), - diff.min().item(), - diff.mean().item())) + print("all_close={}, max={}, min={}, mean={}".format(all_close, diff.max().item(), diff.min().item(), diff.mean().item())) max_indices = torch.nonzero(diff == diff.max().item()) first_index = tuple(max_indices[0].tolist()) print(f"Index: {first_index}, expect: {expect[first_index]}, actual: {actual[first_index]}") -def main(batch=8, - heads=32, - heads_kv=8, - max_cache_seqlen=8192, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32): +def main(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v sparse_ratio = sparse_ratio block_size = block_size @@ -383,14 +319,13 @@ def main(batch=8, print("max_selected_blocks: ", max_selected_blocks) dtype = torch.float16 - Q = torch.randn((batch, heads, dim), dtype=dtype, device='cuda') - K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device='cuda') - V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device='cuda') - cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device='cuda') + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") # Ensure at least one element equals cache_seqlen - random_index = torch.randint(0, batch, (1,), device='cuda').item() # Select a random index - cache_seqlens[ - random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence + random_index = torch.randint(0, batch, (1,), device="cuda").item() # Select a random index + cache_seqlens[random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence # cache_seqlens = torch.full((batch,), max_cache_seqlen, dtype=torch.int32, device='cuda') print("cache_seqlens: ", cache_seqlens) @@ -402,7 +337,7 @@ def main(batch=8, max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() print("max_valid_num_blocks: ", max_valid_num_blocks) # Initialize block_mask with false (for padding blocks) - block_mask = torch.zeros((batch, heads_kv, num_blocks), dtype=torch.bool, device='cuda') + block_mask = torch.zeros((batch, heads_kv, num_blocks), dtype=torch.bool, device="cuda") # Assign valid indices while ensuring no duplicates within each batch-group for b in range(batch): @@ -410,29 +345,26 @@ def main(batch=8, valid_num_block = valid_num_blocks[b].item() # Valid blocks for this batch if valid_num_block > 0: # Ensure there's at least one valid block for h in range(heads_kv): - perm = torch.randperm(max_valid_block, device='cuda')[:valid_num_block] + perm = torch.randperm(max_valid_block, device="cuda")[:valid_num_block] block_mask[b, h, perm] = True # print("block_mask: ", block_mask) # parity reference - ref = ref_program_torch(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, 
num_blocks, - block_size) + ref = ref_program_torch(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size) # out = sparse_gqa_decode_varlen_mask(Q, K, V, block_mask, cache_seqlens, block_size) model = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, block_size) out = model(Q, K, V, block_mask, cache_seqlens) - debug("output", ref, out, atol=1e-3, rtol=1e-3) + assert_close("output", ref, out, atol=1e-3, rtol=1e-3) import flash_attn # noqa: F401 ## latency reference for _ in range(10): - ref = ref_program_fa(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size) + ref = ref_program_fa(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size) torch.cuda.synchronize() start = time.time() for _ in range(100): - ref = ref_program_fa(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size) + ref = ref_program_fa(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size) torch.cuda.synchronize() print("dense time: ", (time.time() - start) / 100 * 1000) @@ -449,17 +381,83 @@ def main(batch=8, print("sparse time: ", (time.time() - start) / 100 * 1000) +def run_regression_perf(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v + sparse_ratio = sparse_ratio + block_size = block_size + max_selected_blocks = int(math.ceil(max_cache_seqlen * (1 - sparse_ratio) / block_size)) + dtype = torch.float16 + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") + random_index = torch.randint(0, batch, (1,), device="cuda").item() + cache_seqlens[random_index] = max_cache_seqlen + + num_blocks = (max_cache_seqlen + block_size - 1) // block_size + + valid_num_blocks = torch.ceil(cache_seqlens * (1 - sparse_ratio) / block_size).int() + max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() + block_mask = torch.zeros((batch, heads_kv, num_blocks), dtype=torch.bool, device="cuda") + + for b in range(batch): + max_valid_block = max_valid_num_blocks[b].item() + valid_num_block = valid_num_blocks[b].item() + if valid_num_block > 0: + for h in range(heads_kv): + perm = torch.randperm(max_valid_block, device="cuda")[:valid_num_block] + block_mask[b, h, perm] = True + + sparse_kernel = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, block_size) + batch = sparse_kernel.batch + heads = sparse_kernel.heads + heads_kv = sparse_kernel.heads_kv + dim_v = sparse_kernel.dim_v + dim = sparse_kernel.dim + block_size = sparse_kernel.block_size + max_selected_blocks = (max_cache_seqlen + block_size - 1) // block_size + + num_m_blocks = 1 * (heads // heads_kv + sparse_kernel.block_H - 1) // sparse_kernel.block_H + num_n_blocks = max_selected_blocks + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 + total_mblocks = batch * heads_kv * num_m_blocks + num_sm = sparse_kernel.num_sm + + num_split = num_splits_heuristic( + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) + + glse = torch.empty((batch, heads, num_split), 
dtype=torch.float32, device="cuda") + output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") + kernel = flashattn( + batch, + heads, + heads_kv, + dim, + dim_v, + block_N=block_size, + block_H=sparse_kernel.block_H, + num_stages=2, + threads=128, + ) + + def run_kernel_only(): + kernel(Q, K, V, block_mask, cache_seqlens, glse, output_partial) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--heads_kv', type=int, default=8, help='heads_kv') - parser.add_argument( - '--max_cache_seqlen', type=int, default=8192, help='kvcache sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--dim_v', type=int, default=128, help='dim_v') - parser.add_argument('--sparse_ratio', type=float, default=0.8, help='sparse ratio') - parser.add_argument('--block_size', type=int, default=32, help='block_size') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--heads_kv", type=int, default=8, help="heads_kv") + parser.add_argument("--max_cache_seqlen", type=int, default=8192, help="kvcache sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--dim_v", type=int, default=128, help="dim_v") + parser.add_argument("--sparse_ratio", type=float, default=0.8, help="sparse ratio") + parser.add_argument("--block_size", type=int, default=32, help="block_size") args = parser.parse_args() - main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, - args.sparse_ratio, args.block_size) + main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, args.sparse_ratio, args.block_size) diff --git a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py index 85b72b775..91d85a1a4 100644 --- a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py +++ b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py @@ -5,19 +5,15 @@ import argparse from einops import rearrange, einsum import torch.nn.functional as F - import math import time from heuristic import num_splits_heuristic +from tilelang.profiler import do_bench @triton.autotune( - configs=[ - triton.Config({}, num_warps=num_warps, num_stages=num_stages) - for num_warps in [1, 2, 4]\ - for num_stages in [1, 2, 3, 4, 7] - ], - key=['BLOCK_H', 'BLOCK_N', 'BLOCK_D'], + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4] for num_stages in [1, 2, 3, 4, 7]], + key=["BLOCK_H", "BLOCK_N", "BLOCK_D"], ) @triton.jit def _split_kernel( @@ -79,16 +75,11 @@ def _split_kernel( loop_range = blocks_per_split q_ptr += batch_idx * stride_q_b + head_idx_q * stride_q_h - k_cache_ptr += batch_idx * stride_k_b + head_idx_kv * stride_k_h + offs_n[ - None, :] * stride_k_s + offs_d[:, None] * stride_k_d - v_cache_ptr += batch_idx * stride_v_b + head_idx_kv * stride_v_h + offs_n[:, - None] * stride_v_s + offs_d[ - None, :] * stride_v_d + k_cache_ptr += batch_idx * stride_k_b + head_idx_kv * stride_k_h + offs_n[None, :] * stride_k_s + offs_d[:, None] * 
stride_k_d + v_cache_ptr += batch_idx * stride_v_b + head_idx_kv * stride_v_h + offs_n[:, None] * stride_v_s + offs_d[None, :] * stride_v_d mask_ptr += batch_idx * stride_mask_b + head_idx_kv * stride_mask_h - q = tl.load( - q_ptr + offs_h[:, None] * stride_q_h + offs_d[None, :] * stride_q_d, - mask=offs_h[:, None] < gqa_group_size) + q = tl.load(q_ptr + offs_h[:, None] * stride_q_h + offs_d[None, :] * stride_q_d, mask=offs_h[:, None] < gqa_group_size) start = blocks_per_split * split_idx + tl.minimum(split_idx, remaining_blocks) for i in range(loop_range): block_idx = tl.load(mask_ptr + (start + i) * stride_mask_s) @@ -119,23 +110,18 @@ def _split_kernel( acc = acc * l_recip acc = acc.to(o_partial_ptr.dtype.element_ty) - lse_partial_ptr += batch_idx * stride_lse_b + ( - head_idx_q + offs_h) * stride_lse_h + split_idx * stride_lse_split + lse_partial_ptr += batch_idx * stride_lse_b + (head_idx_q + offs_h) * stride_lse_h + split_idx * stride_lse_split tl.store(lse_partial_ptr, m_i, mask=offs_h < gqa_group_size) - o_partial_ptr += batch_idx * stride_o_b + ( - head_idx_q + - offs_h[:, None]) * stride_o_h + split_idx * stride_o_split + offs_d[None, :] * stride_o_d + o_partial_ptr += ( + batch_idx * stride_o_b + (head_idx_q + offs_h[:, None]) * stride_o_h + split_idx * stride_o_split + offs_d[None, :] * stride_o_d + ) tl.store(o_partial_ptr, acc, mask=offs_h[:, None] < gqa_group_size) @triton.autotune( - configs=[ - triton.Config({}, num_warps=num_warps, num_stages=num_stages) - for num_warps in [1, 2, 4]\ - for num_stages in [1, 2, 3, 4, 7] - ], - key=['BLOCK_D'], + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4] for num_stages in [1, 2, 3, 4, 7]], + key=["BLOCK_D"], ) @triton.jit def _merge_kernel( @@ -163,18 +149,15 @@ def _merge_kernel( offs_d = tl.arange(0, BLOCK_D) lse_offsets = lse_partial_ptr + batch_idx * lse_partial_stride_b + head_idx * lse_partial_stride_h - lse = tl.load( - lse_offsets + offs_splits * lse_partial_stride_split, - mask=offs_splits < num_splits, - other=float("-inf")) + lse = tl.load(lse_offsets + offs_splits * lse_partial_stride_split, mask=offs_splits < num_splits, other=float("-inf")) lse_max = tl.max(lse) o_offsets = o_partial_ptr + batch_idx * o_partial_stride_b + head_idx * o_partial_stride_h o_partial = tl.load( - o_offsets + offs_splits[:, None] * o_partial_stride_split + - offs_d[None, :] * o_partial_stride_d, - mask=offs_splits[:, None] < num_splits) + o_offsets + offs_splits[:, None] * o_partial_stride_split + offs_d[None, :] * o_partial_stride_d, + mask=offs_splits[:, None] < num_splits, + ) sumexp_normalized_splitk = tl.exp(lse - lse_max) sumexp_normalized = tl.sum(sumexp_normalized_splitk, axis=0) numerator_normalized = tl.sum(o_partial * sumexp_normalized_splitk[:, None], axis=0) @@ -209,19 +192,13 @@ def block_sparse_flash_decode_gqa_indice_triton( num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H num_n_blocks = max_selected_blocks - size_one_kv_head = max_selected_blocks * block_size * ( - dim + dim_v) * 2 #kv_seqlen * (dim + dim_v) * 2 + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks num_sm = 64 # num_sm = self.num_sm num_splits = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + 
) # print("num_splits:", num_splits, "num_blocks:", num_n_blocks) @@ -295,24 +272,18 @@ def block_sparse_flash_decode_gqa_indice_triton( return output -def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): - +def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, block_size): batch, heads, dim = query.shape heads_kv = key.shape[2] dim_v = value.shape[-1] num_head_groups = query.shape[1] // key.shape[2] scale = dim**0.5 - key = rearrange(key, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] - value = rearrange(value, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] + key = rearrange(key, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] + value = rearrange(value, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, heads_kv, seqlen_kv] sparse_mask = torch.zeros_like(scores) # Assign mask values based on block_indices @@ -321,42 +292,33 @@ def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache valid_indices = block_indices[b, h] # Extract indices for this batch and head for idx in valid_indices: if idx >= 0: - sparse_mask[b, :, h, idx * block_size:(idx + 1) * block_size] = 1 - scores = scores.masked_fill(sparse_mask == 0, float('-inf')) + sparse_mask[b, :, h, idx * block_size : (idx + 1) * block_size] = 1 + scores = scores.masked_fill(sparse_mask == 0, float("-inf")) - range_len = torch.arange(scores.shape[-1], device='cuda').unsqueeze(0) + range_len = torch.arange(scores.shape[-1], device="cuda").unsqueeze(0) cache_seqlens_expanded = cache_seqlens.unsqueeze(1) pad_mask = range_len >= cache_seqlens_expanded pad_mask = pad_mask[:, None, None, :] - scores = scores.masked_fill(pad_mask, float('-inf')) - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = scores.masked_fill(pad_mask, float("-inf")) + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] - out = einsum(attention, value, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, heads_kv, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, value, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, heads_kv, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out def ref_program_fa(query, key, value, cache_seqlens): # latency reference # from flash_attn_interface import flash_attn_with_kvcache # fa3 - from flash_attn import flash_attn_with_kvcache #fa2 + from flash_attn import flash_attn_with_kvcache # fa2 + query = query.unsqueeze(1) output = flash_attn_with_kvcache(query, key, value, cache_seqlens=cache_seqlens) output = output.squeeze(1) return output -def main(batch=64, - heads=32, - heads_kv=8, - max_cache_seqlen=8192, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32): - +def main(batch=64, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, 
block_size=32): batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v sparse_ratio = sparse_ratio block_size = block_size @@ -367,49 +329,31 @@ def main(batch=64, max_selected_blocks = int(math.ceil(max_cache_seqlen * (1 - sparse_ratio) / block_size)) print("max_selected_blocks: ", max_selected_blocks) dtype = torch.float16 - block_H = 64 - Q = torch.randn((batch, heads, dim), dtype=dtype, device='cuda') - K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device='cuda') - V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device='cuda') - cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device='cuda') - # cache_seqlens = torch.full((batch,), max_cache_seqlen, dtype=torch.int32, device='cuda') + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") # Ensure at least one element equals cache_seqlen - random_index = torch.randint(0, batch, (1,), device='cuda').item() # Select a random index - cache_seqlens[ - random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence - - print("cache_seqlens: ", cache_seqlens) - + random_index = torch.randint(0, batch, (1,), device="cuda").item() # Select a random index + cache_seqlens[random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() - print("max_valid_num_blocks: ", max_valid_num_blocks) # Initialize block_indices with -1 (for padding blocks) - block_indices = torch.full((batch, heads_kv, max_selected_blocks), - -1, - dtype=torch.int32, - device='cuda') + block_indices = torch.full((batch, heads_kv, max_selected_blocks), -1, dtype=torch.int32, device="cuda") # Assign valid indices while ensuring no duplicates within each batch-group for b in range(batch): max_valid_block = max_valid_num_blocks[b].item() # Max valid blocks for this batch if max_valid_block > 0: # Ensure there's at least one valid block for h in range(heads_kv): - valid_indices = torch.randperm( - max_valid_block, device='cuda', dtype=torch.int32)[:max_selected_blocks] - block_indices[b, h, :len(valid_indices)] = valid_indices + valid_indices = torch.randperm(max_valid_block, device="cuda", dtype=torch.int32)[:max_selected_blocks] + block_indices[b, h, : len(valid_indices)] = valid_indices # Sort indices within each batch-group for consistency block_indices, _ = block_indices.sort(dim=-1, descending=True) - # print("block_indices: ", block_indices) - actual_num_blocks = torch.sum(block_indices != -1, dim=-1).to(torch.int32)[:, 0] - print("actual_num_blocks: ", actual_num_blocks) - # print(block_indices.shape, actual_num_blocks.shape) - max_num_blocks = torch.max(max_valid_num_blocks).item() - print("max_num_blocks: ", max_num_blocks) - ref = ref_program_torch(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, - block_size) + ref = ref_program_torch(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, block_size) triton_out = block_sparse_flash_decode_gqa_indice_triton( Q, @@ -423,8 +367,7 @@ def main(batch=64, ) print("max difference: ", torch.max(torch.abs(ref - triton_out))) - assert torch.allclose( - ref, 
triton_out, atol=1e-2), "Output mismatch between Triton and reference implementation" + assert torch.allclose(ref, triton_out, atol=1e-2), "Output mismatch between Triton and reference implementation" print("Passed the ref test!") # Measure performance @@ -447,6 +390,7 @@ def main(batch=64, avg_time = elapsed_time / 1000 avg_flops = total_flops / avg_time print(f"Average time: {avg_time:.6f} seconds") + print(f"Average FLOPS: {avg_flops:.2f} GFLOPS") # Measure performance of reference implementation import flash_attn # noqa: F401 @@ -460,21 +404,19 @@ def main(batch=64, avg_time_ref = elapsed_time_ref / 1000 avg_flops_ref = total_flops / avg_time_ref print(f"Average time of ref: {avg_time_ref:.6f} seconds") - + print(f"Average FLOPS of ref: {avg_flops_ref:.2f} GFLOPS") print(f"Speedup: {avg_time_ref / avg_time:.2f}x") if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=64, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--heads_kv', type=int, default=8, help='heads_kv') - parser.add_argument( - '--max_cache_seqlen', type=int, default=8192, help='kvcache sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--dim_v', type=int, default=128, help='dim_v') - parser.add_argument('--sparse_ratio', type=float, default=0.8, help='sparse ratio') - parser.add_argument('--block_size', type=int, default=32, help='block_size') + parser.add_argument("--batch", type=int, default=64, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--heads_kv", type=int, default=8, help="heads_kv") + parser.add_argument("--max_cache_seqlen", type=int, default=8192, help="kvcache sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--dim_v", type=int, default=128, help="dim_v") + parser.add_argument("--sparse_ratio", type=float, default=0.8, help="sparse ratio") + parser.add_argument("--block_size", type=int, default=32, help="block_size") args = parser.parse_args() - main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, - args.sparse_ratio, args.block_size) + main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, args.sparse_ratio, args.block_size) diff --git a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py index 348572526..232bcacaf 100644 --- a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py +++ b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py @@ -4,19 +4,14 @@ import argparse from einops import rearrange, einsum import torch.nn.functional as F - import math import time from heuristic import num_splits_heuristic @triton.autotune( - configs=[ - triton.Config({}, num_warps=num_warps, num_stages=num_stages) - for num_warps in [1, 2, 4]\ - for num_stages in [1, 2, 3, 4, 7] - ], - key=['BLOCK_H', 'BLOCK_N', 'BLOCK_D'], + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4] for num_stages in [1, 2, 3, 4, 7]], + key=["BLOCK_H", "BLOCK_N", "BLOCK_D"], ) @triton.jit def _split_kernel( @@ -77,16 +72,11 @@ def _split_kernel( loop_range = blocks_per_split q_ptr += batch_idx * stride_q_b + head_idx_q * stride_q_h - k_cache_ptr += batch_idx * stride_k_b + 
head_idx_kv * stride_k_h + offs_n[ - None, :] * stride_k_s + offs_d[:, None] * stride_k_d - v_cache_ptr += batch_idx * stride_v_b + head_idx_kv * stride_v_h + offs_n[:, - None] * stride_v_s + offs_d[ - None, :] * stride_v_d + k_cache_ptr += batch_idx * stride_k_b + head_idx_kv * stride_k_h + offs_n[None, :] * stride_k_s + offs_d[:, None] * stride_k_d + v_cache_ptr += batch_idx * stride_v_b + head_idx_kv * stride_v_h + offs_n[:, None] * stride_v_s + offs_d[None, :] * stride_v_d mask_ptr += batch_idx * stride_mask_b + head_idx_kv * stride_mask_h - q = tl.load( - q_ptr + offs_h[:, None] * stride_q_h + offs_d[None, :] * stride_q_d, - mask=offs_h[:, None] < gqa_group_size) + q = tl.load(q_ptr + offs_h[:, None] * stride_q_h + offs_d[None, :] * stride_q_d, mask=offs_h[:, None] < gqa_group_size) start = blocks_per_split * split_idx + tl.minimum(split_idx, remaining_blocks) for block_idx in range(loop_range): start_n = (start + block_idx) * BLOCK_N @@ -117,23 +107,18 @@ def _split_kernel( acc = acc * l_recip acc = acc.to(o_partial_ptr.dtype.element_ty) - lse_partial_ptr += batch_idx * stride_lse_b + ( - head_idx_q + offs_h) * stride_lse_h + split_idx * stride_lse_split + lse_partial_ptr += batch_idx * stride_lse_b + (head_idx_q + offs_h) * stride_lse_h + split_idx * stride_lse_split tl.store(lse_partial_ptr, m_i, mask=offs_h < gqa_group_size) - o_partial_ptr += batch_idx * stride_o_b + ( - head_idx_q + - offs_h[:, None]) * stride_o_h + split_idx * stride_o_split + offs_d[None, :] * stride_o_d + o_partial_ptr += ( + batch_idx * stride_o_b + (head_idx_q + offs_h[:, None]) * stride_o_h + split_idx * stride_o_split + offs_d[None, :] * stride_o_d + ) tl.store(o_partial_ptr, acc, mask=offs_h[:, None] < gqa_group_size) @triton.autotune( - configs=[ - triton.Config({}, num_warps=num_warps, num_stages=num_stages) - for num_warps in [1, 2, 4]\ - for num_stages in [1, 2, 3, 4, 7] - ], - key=['BLOCK_D'], + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4] for num_stages in [1, 2, 3, 4, 7]], + key=["BLOCK_D"], ) @triton.jit def _merge_kernel( @@ -161,18 +146,15 @@ def _merge_kernel( offs_d = tl.arange(0, BLOCK_D) lse_offsets = lse_partial_ptr + batch_idx * lse_partial_stride_b + head_idx * lse_partial_stride_h - lse = tl.load( - lse_offsets + offs_splits * lse_partial_stride_split, - mask=offs_splits < num_splits, - other=float("-inf")) + lse = tl.load(lse_offsets + offs_splits * lse_partial_stride_split, mask=offs_splits < num_splits, other=float("-inf")) lse_max = tl.max(lse) o_offsets = o_partial_ptr + batch_idx * o_partial_stride_b + head_idx * o_partial_stride_h o_partial = tl.load( - o_offsets + offs_splits[:, None] * o_partial_stride_split + - offs_d[None, :] * o_partial_stride_d, - mask=offs_splits[:, None] < num_splits) + o_offsets + offs_splits[:, None] * o_partial_stride_split + offs_d[None, :] * o_partial_stride_d, + mask=offs_splits[:, None] < num_splits, + ) sumexp_normalized_splitk = tl.exp(lse - lse_max) sumexp_normalized = tl.sum(sumexp_normalized_splitk, axis=0) numerator_normalized = tl.sum(o_partial * sumexp_normalized_splitk[:, None], axis=0) @@ -207,19 +189,13 @@ def block_sparse_flash_decode_gqa_mask_triton( num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H num_n_blocks = max_selected_blocks - size_one_kv_head = max_selected_blocks * block_size * ( - dim + dim_v) * 2 #kv_seqlen * (dim + dim_v) * 2 + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * 
heads_kv * num_m_blocks num_sm = 64 # num_sm = self.num_sm num_splits = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) # print("num_splits:", num_splits, "num_blocks:", num_n_blocks) @@ -292,24 +268,18 @@ def block_sparse_flash_decode_gqa_mask_triton( return output -def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): - +def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size): batch, heads, dim = query.shape heads_kv = key.shape[2] num_head_groups = query.shape[1] // key.shape[2] scale = dim**0.5 - key = rearrange(key, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] - value = rearrange(value, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] + key = rearrange(key, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] + value = rearrange(value, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, heads_kv, seqlen_kv] sparse_mask = torch.zeros_like(scores) # Assign mask values @@ -317,43 +287,34 @@ def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_se for h in range(heads_kv): for idx in range(num_blocks): if block_mask[b, h, idx]: - sparse_mask[b, :, h, idx * block_size:(idx + 1) * block_size] = 1 + sparse_mask[b, :, h, idx * block_size : (idx + 1) * block_size] = 1 - scores = scores.masked_fill(sparse_mask == 0, float('-inf')) + scores = scores.masked_fill(sparse_mask == 0, float("-inf")) - range_len = torch.arange(scores.shape[-1], device='cuda').unsqueeze(0) + range_len = torch.arange(scores.shape[-1], device="cuda").unsqueeze(0) cache_seqlens_expanded = cache_seqlens.unsqueeze(1) pad_mask = range_len >= cache_seqlens_expanded pad_mask = pad_mask[:, None, None, :] - scores = scores.masked_fill(pad_mask, float('-inf')) - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = scores.masked_fill(pad_mask, float("-inf")) + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] - out = einsum(attention, value, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, heads_kv, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, value, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, heads_kv, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out def ref_program_fa(query, key, value, cache_seqlens): # latency reference # from flash_attn_interface import flash_attn_with_kvcache # fa3 - from flash_attn import flash_attn_with_kvcache #fa2 + from flash_attn import flash_attn_with_kvcache # fa2 + query = query.unsqueeze(1) output = flash_attn_with_kvcache(query, key, value, cache_seqlens=cache_seqlens) output = output.squeeze(1) return output -def 
main(batch=64, - heads=32, - heads_kv=8, - max_cache_seqlen=8192, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32): - +def main(batch=64, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v block_size = block_size sparse_ratio = sparse_ratio @@ -363,14 +324,13 @@ def main(batch=64, dtype = torch.float16 - Q = torch.randn((batch, heads, dim), dtype=dtype, device='cuda') - K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device='cuda') - V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device='cuda') - cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device='cuda') + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") # Ensure at least one element equals cache_seqlen - random_index = torch.randint(0, batch, (1,), device='cuda').item() # Select a random index - cache_seqlens[ - random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence + random_index = torch.randint(0, batch, (1,), device="cuda").item() # Select a random index + cache_seqlens[random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence num_blocks = (max_cache_seqlen + block_size - 1) // block_size @@ -379,7 +339,7 @@ def main(batch=64, max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() print("max_valid_num_blocks: ", max_valid_num_blocks) # Initialize block_mask with false (for padding blocks) - block_mask = torch.zeros((batch, heads_kv, num_blocks), dtype=torch.bool, device='cuda') + block_mask = torch.zeros((batch, heads_kv, num_blocks), dtype=torch.bool, device="cuda") # Assign valid indices while ensuring no duplicates within each batch-group for b in range(batch): @@ -387,11 +347,10 @@ def main(batch=64, valid_num_block = valid_num_blocks[b].item() # Valid blocks for this batch if valid_num_block > 0: # Ensure there's at least one valid block for h in range(heads_kv): - perm = torch.randperm(max_valid_block, device='cuda')[:valid_num_block] + perm = torch.randperm(max_valid_block, device="cuda")[:valid_num_block] block_mask[b, h, perm] = True - ref = ref_program_torch(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size) + ref = ref_program_torch(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size) triton_out = block_sparse_flash_decode_gqa_mask_triton( Q, @@ -404,8 +363,7 @@ def main(batch=64, ) # print("max difference: ", torch.max(torch.abs(ref - triton_out))) - assert torch.allclose( - ref, triton_out, atol=1e-2), "Output mismatch between Triton and reference implementation" + assert torch.allclose(ref, triton_out, atol=1e-2), "Output mismatch between Triton and reference implementation" print("Passed the ref test!") # Measure performance @@ -448,15 +406,13 @@ def main(batch=64, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=64, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--heads_kv', type=int, default=8, help='heads_kv') - 
parser.add_argument( - '--max_cache_seqlen', type=int, default=8192, help='kvcache sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--dim_v', type=int, default=128, help='dim_v') - parser.add_argument('--sparse_ratio', type=float, default=0.8, help='sparse ratio') - parser.add_argument('--block_size', type=int, default=32, help='block_size') + parser.add_argument("--batch", type=int, default=64, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--heads_kv", type=int, default=8, help="heads_kv") + parser.add_argument("--max_cache_seqlen", type=int, default=8192, help="kvcache sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--dim_v", type=int, default=128, help="dim_v") + parser.add_argument("--sparse_ratio", type=float, default=0.8, help="sparse ratio") + parser.add_argument("--block_size", type=int, default=32, help="block_size") args = parser.parse_args() - main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, - args.sparse_ratio, args.block_size) + main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, args.sparse_ratio, args.block_size) diff --git a/examples/blocksparse_attention/heuristic.py b/examples/blocksparse_attention/heuristic.py index b60a81dc3..0e6fc5281 100644 --- a/examples/blocksparse_attention/heuristic.py +++ b/examples/blocksparse_attention/heuristic.py @@ -1,8 +1,7 @@ import math -def num_splits_heuristic(total_mblocks, num_SMs, num_n_blocks, num_m_blocks, size_one_kv_head, - is_causal_or_local, max_splits): +def num_splits_heuristic(total_mblocks, num_SMs, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local, max_splits): """ Determines the optimal number of splits for maximizing GPU occupancy while balancing memory efficiency. 
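Editor's note on the num_splits_heuristic signature reformatted above: its docstring says it trades GPU occupancy against memory efficiency, and the Triton decode example earlier in this patch calls it with total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128. The sketch below only illustrates that occupancy trade-off; the name simple_num_splits and the 0.8 threshold are assumptions made for this example and are not the body of heuristic.py, which this patch does not change.

import math


def simple_num_splits(total_mblocks, num_SMs, num_n_blocks, max_splits=128):
    # Illustrative sketch, not the repo's implementation. If there are already
    # enough independent (batch, kv-head, m-block) tiles to keep most SMs busy,
    # splitting the KV dimension only adds merge overhead.
    if total_mblocks >= 0.8 * num_SMs:
        return 1
    best_splits, best_eff = 1, 0.0
    # Never split finer than the number of KV blocks or the number of SMs.
    for s in range(1, min(max_splits, num_SMs, num_n_blocks) + 1):
        n_waves = total_mblocks * s / num_SMs   # work measured in waves of SMs
        eff = n_waves / math.ceil(n_waves)      # useful fraction of the last wave
        if eff > best_eff + 1e-4:               # ties favor fewer splits
            best_eff, best_splits = eff, s
    return best_splits


print(simple_num_splits(total_mblocks=16, num_SMs=64, num_n_blocks=32))  # -> 4

With a small total_mblocks (few batches and KV heads, as in the decode example), such a heuristic raises the split count until each wave of SMs is nearly full, which is the behavior the docstring describes.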
diff --git a/examples/blocksparse_attention/regression_example_blocksparse_attention.py b/examples/blocksparse_attention/regression_example_blocksparse_attention.py new file mode 100644 index 000000000..26fa60df5 --- /dev/null +++ b/examples/blocksparse_attention/regression_example_blocksparse_attention.py @@ -0,0 +1,20 @@ +import tilelang.testing +import example_tilelang_block_sparse_attn +import example_tilelang_sparse_gqa_decode_varlen_indice +import example_tilelang_sparse_gqa_decode_varlen_mask + + +def regression_example_tilelang_block_sparse_attn(): + tilelang.testing.process_func(example_tilelang_block_sparse_attn.run_regression_perf) + + +def regression_example_tilelang_sparse_gqa_decode_varlen_indice(): + tilelang.testing.process_func(example_tilelang_sparse_gqa_decode_varlen_indice.run_regression_perf, batch=1, max_cache_seqlen=2048) + + +def regression_example_tilelang_sparse_gqa_decode_varlen_mask(): + tilelang.testing.process_func(example_tilelang_sparse_gqa_decode_varlen_mask.run_regression_perf, batch=1, max_cache_seqlen=2048) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/blocksparse_attention/test_example_blocksparse_attention.py b/examples/blocksparse_attention/test_example_blocksparse_attention.py index adda1f0f1..dd33f46c4 100644 --- a/examples/blocksparse_attention/test_example_blocksparse_attention.py +++ b/examples/blocksparse_attention/test_example_blocksparse_attention.py @@ -25,26 +25,14 @@ def test_example_tilelang_sparse_gqa_decode_varlen_mask(): def test_example_triton_sparse_gqa_decode_varlen_indice(): example_triton_sparse_gqa_decode_varlen_indice.main( - batch=8, - heads=8, - heads_kv=4, - max_cache_seqlen=2048, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32) + batch=8, heads=8, heads_kv=4, max_cache_seqlen=2048, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32 + ) def test_example_triton_sparse_gqa_decode_varlen_mask(): example_triton_sparse_gqa_decode_varlen_mask.main( - batch=16, - heads=16, - heads_kv=8, - max_cache_seqlen=1024, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32) + batch=16, heads=16, heads_kv=8, max_cache_seqlen=1024, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32 + ) if __name__ == "__main__": diff --git a/examples/blocksparse_gemm/example_blocksparse_gemm.py b/examples/blocksparse_gemm/example_blocksparse_gemm.py index 7b9cff7c1..289421548 100644 --- a/examples/blocksparse_gemm/example_blocksparse_gemm.py +++ b/examples/blocksparse_gemm/example_blocksparse_gemm.py @@ -2,10 +2,9 @@ import itertools import tilelang import tilelang.language as T -from tilelang.engine.param import KernelParam from tilelang.utils.tensor import get_tensor_supply, TensorSupplyType import torch -from typing import List +from tilelang.profiler import do_bench DEFAULT_BLOCK_M = 128 DEFAULT_BLOCK_N = 128 @@ -13,25 +12,8 @@ DEFAULT_NUM_STAGES = 2 DEFAULT_THREAD_NUM = 128 DEFAULT_ENABLE_RASTERIZATION = True - -parser = argparse.ArgumentParser(description="Autotuned BlockSparse MatMul Benchmark") -parser.add_argument("--m", type=int, default=1024, help="Matrix dimension M") -parser.add_argument("--n", type=int, default=1024, help="Matrix dimension N") -parser.add_argument("--k", type=int, default=1024, help="Matrix dimension K") -parser.add_argument("--sparsity", type=float, default=0.5, help="Sparsity ratio (0-1)") -parser.add_argument( - "--use_autotune", action="store_true", default=False, help="Whether to use autotune") - -args, _ = parser.parse_known_args() -M, N, K = args.m, args.n, args.k 
-sparsity = args.sparsity -use_autotune = args.use_autotune default_tensor_supply = get_tensor_supply(TensorSupplyType.Auto) -print(f"Running BlockSparse MatMul Benchmark for M={M}, N={N}, K={K}") -print(f"Target Block Sparsity: {sparsity}") -print(f"Using Autotuner: {use_autotune}\n") - def get_configs(): block_M = [64, 128, 256] @@ -41,76 +23,52 @@ def get_configs(): thread_num = [128, 256] enable_rasterization = [True, False] - _configs = list( - itertools.product(block_M, block_N, block_K, num_stages, thread_num, enable_rasterization)) + _configs = list(itertools.product(block_M, block_N, block_K, num_stages, thread_num, enable_rasterization)) - return [{ - "block_M": c[0], - "block_N": c[1], - "block_K": c[2], - "num_stages": c[3], - "thread_num": c[4], - "enable_rasteration": c[5], - } for c in _configs] + return [ + { + "block_M": c[0], + "block_N": c[1], + "block_K": c[2], + "num_stages": c[3], + "thread_num": c[4], + "enable_rasteration": c[5], + } + for c in _configs + ] def ref_program(A, B, BlockMask, block_M, block_N, block_K): + M, K = A.shape + _, N = B.shape ref_c = torch.zeros((M, N), dtype=torch.float16, device=A.device) for i in range(M // block_M): for j in range(N // block_N): accu = torch.zeros((block_M, block_N), dtype=torch.float32, device=A.device) for k in range(K // block_K): if BlockMask[i, j, k]: - accu += ( - A[i * block_M:(i + 1) * block_M, k * block_K:(k + 1) * block_K].to( - torch.float32) @ B[k * block_K:(k + 1) * block_K, - j * block_N:(j + 1) * block_N].to(torch.float32)) - ref_c[i * block_M:(i + 1) * block_M, - j * block_N:(j + 1) * block_N] = accu.to(torch.float16) + accu += A[i * block_M : (i + 1) * block_M, k * block_K : (k + 1) * block_K].to(torch.float32) @ B[ + k * block_K : (k + 1) * block_K, j * block_N : (j + 1) * block_N + ].to(torch.float32) + ref_c[i * block_M : (i + 1) * block_M, j * block_N : (j + 1) * block_N] = accu.to(torch.float16) return ref_c -def supply_program(params: List[KernelParam]): - input_tensors = [] - - for p in params: - # Check if the kernel parameter is BlockMask tensor. - # Here, BlockMask is uniquely identified by having 3 dimensions. - if len(p.shape) != 3: - # For non-BlockMask tensors, use the default tensor generation logic. - input_tensors.append(default_tensor_supply(p)) - else: - # For BlockMask tensor, randomly set elements to True based on desired - # sparsity level. 
- block_mask = torch.zeros(p.shape, dtype=torch.bool, device=torch.cuda.current_device()) - block_mask[:, :, :] = torch.rand(p.shape) > sparsity - input_tensors.append(block_mask) - - return input_tensors - - -@tilelang.autotune(configs=get_configs(),) +@tilelang.autotune( + configs=get_configs(), +) @tilelang.jit(out_idx=[-1]) -def blocksparse_matmul(M, - N, - K, - block_M, - block_N, - block_K, - num_stages, - thread_num, - enable_rasteration, - dtype="float16", - accum_dtype="float"): - +def blocksparse_matmul( + M, N, K, block_M, block_N, block_K, num_stages, thread_num, enable_rasteration, dtype=T.float16, accum_dtype=T.float32 +): block_mask_shape = (M // block_M, N // block_N, K // block_K) @T.prim_func def block_sparse_matmul( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - BlockMask: T.Tensor(block_mask_shape, "bool"), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + BlockMask: T.Tensor(block_mask_shape, "bool"), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -134,7 +92,20 @@ def block_sparse_matmul( def main(): - + parser = argparse.ArgumentParser(description="Autotuned BlockSparse MatMul Benchmark") + parser.add_argument("--m", type=int, default=1024, help="Matrix dimension M") + parser.add_argument("--n", type=int, default=1024, help="Matrix dimension N") + parser.add_argument("--k", type=int, default=1024, help="Matrix dimension K") + parser.add_argument("--sparsity", type=float, default=0.5, help="Sparsity ratio (0-1)") + parser.add_argument("--use_autotune", action="store_true", default=False, help="Whether to use autotune") + + args, _ = parser.parse_known_args() + M, N, K = args.m, args.n, args.k + sparsity = args.sparsity + use_autotune = args.use_autotune + print(f"Running BlockSparse MatMul Benchmark for M={M}, N={N}, K={K}") + print(f"Target Block Sparsity: {sparsity}") + print(f"Using Autotuner: {use_autotune}\n") # Initialize input matrices A and B on the GPU with half precision a = torch.randn(M, K).cuda().half() b = torch.randn(K, N).cuda().half() @@ -147,8 +118,7 @@ def main(): best_config = kernel.config best_latency = kernel.latency - block_M, block_N, block_K = best_config["block_M"], best_config["block_N"], best_config[ - "block_K"] + block_M, block_N, block_K = best_config["block_M"], best_config["block_N"], best_config["block_K"] print(f"Best Config: {best_config}") print(f"Sparsity Ratio: {sparsity}") @@ -163,10 +133,10 @@ def main(): block_K=DEFAULT_BLOCK_K, num_stages=DEFAULT_NUM_STAGES, thread_num=DEFAULT_THREAD_NUM, - enable_rasteration=DEFAULT_ENABLE_RASTERIZATION) + enable_rasteration=DEFAULT_ENABLE_RASTERIZATION, + ) block_M, block_N, block_K = DEFAULT_BLOCK_M, DEFAULT_BLOCK_N, DEFAULT_BLOCK_K print(f"Using default kernel with block size ({block_M}, {block_N}, {block_K})") - # Create block mask with desired sparsity mask_shape = (M // block_M, N // block_N, K // block_K) block_mask = torch.rand(mask_shape).cuda() > sparsity @@ -185,5 +155,34 @@ def main(): print(e) +def run_regression_perf(): + M = N = K = 1024 + sparsity = 0.5 + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + a = torch.randn(M, K).cuda().half() + b = torch.randn(K, N).cuda().half() + + kernel = blocksparse_matmul( + M, + N, + K, + block_M=DEFAULT_BLOCK_M, + block_N=DEFAULT_BLOCK_N, + block_K=DEFAULT_BLOCK_K, + num_stages=DEFAULT_NUM_STAGES, + thread_num=DEFAULT_THREAD_NUM, + 
enable_rasteration=DEFAULT_ENABLE_RASTERIZATION, + ) + block_M, block_N, block_K = DEFAULT_BLOCK_M, DEFAULT_BLOCK_N, DEFAULT_BLOCK_K + mask_shape = (M // block_M, N // block_N, K // block_K) + block_mask = torch.rand(mask_shape).cuda() > sparsity + + def run_kernel_only(): + kernel(a, b, block_mask) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/blocksparse_gemm/regression_example_blocksparse_gemm.py b/examples/blocksparse_gemm/regression_example_blocksparse_gemm.py new file mode 100644 index 000000000..81900a00c --- /dev/null +++ b/examples/blocksparse_gemm/regression_example_blocksparse_gemm.py @@ -0,0 +1,10 @@ +import tilelang.testing +import example_blocksparse_gemm + + +def regression_example_blocksparse_gemm(): + tilelang.testing.process_func(example_blocksparse_gemm.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/cast/example_group_per_split_token_cast_to_fp8.py b/examples/cast/example_group_per_split_token_cast_to_fp8.py index 102ac2021..db6beab1e 100644 --- a/examples/cast/example_group_per_split_token_cast_to_fp8.py +++ b/examples/cast/example_group_per_split_token_cast_to_fp8.py @@ -5,8 +5,8 @@ from tilelang.utils.tensor import torch_assert_close # support bfloat16, float, float16 -dtype = "bfloat16" -accum_dtype = "float" +dtype = T.bfloat16 +accum_dtype = T.float32 @tilelang.jit(out_idx=[2, 3]) @@ -16,11 +16,13 @@ def group_per_split_token_cast_to_fp8(M, M_max, N, BG, blk_m): fp8_max = 448.0 @T.prim_func - def group_per_split_token_cast(X: T.Tensor((M, N), dtype), batch_sizes: T.Tensor( - (BG,), "int32"), X_fp8: T.Tensor((BG, M_max, N), "float8_e4m3"), X_amax: T.Tensor( - (BG, M_max, T.ceildiv(N, group_size)), accum_dtype)): - with T.Kernel( - T.ceildiv(M_max, blk_m), T.ceildiv(N, group_size), BG, threads=128) as (bx, by, bz): + def group_per_split_token_cast( + X: T.Tensor((M, N), dtype), + batch_sizes: T.Tensor((BG,), T.int32), + X_fp8: T.Tensor((BG, M_max, N), T.float8_e4m3fn), + X_amax: T.Tensor((BG, M_max, T.ceildiv(N, group_size)), accum_dtype), + ): + with T.Kernel(T.ceildiv(M_max, blk_m), T.ceildiv(N, group_size), BG, threads=128) as (bx, by, bz): row = bx row_g_id = by bg = bz @@ -28,39 +30,29 @@ def group_per_split_token_cast(X: T.Tensor((M, N), dtype), batch_sizes: T.Tensor y_amax_local = T.alloc_fragment((blk_m,), accum_dtype) y_s_local = T.alloc_fragment((blk_m,), accum_dtype) y_q_local = T.alloc_fragment((blk_m, group_size), accum_dtype) - y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), "float8_e4m3") - row_offset = T.alloc_fragment((1,), "int32") + y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), T.float8_e4m3fn) + row_offset = T.alloc_var(dtype=T.int32) - T.annotate_layout({ - y_local: - T.Fragment( - y_local.shape, - forward_thread_fn=lambda i, j: (i // (blk_m // 4)) * 32 + j % 32), - }) - - row_offset[0] = 0 + row_offset = 0 for i in T.serial(bg): - row_offset[0] += batch_sizes[i] + row_offset += batch_sizes[i] T.copy( - X[row_offset[0] + row * blk_m:row_offset[0] + (row + 1) * blk_m, - row_g_id * group_size:(row_g_id + 1) * group_size], y_local) + X[row_offset + row * blk_m : row_offset + (row + 1) * blk_m, row_g_id * group_size : (row_g_id + 1) * group_size], + y_local, + ) T.reduce_absmax(y_local, y_amax_local, dim=1) for i in T.Parallel(blk_m): y_amax_local[i] = T.max(y_amax_local[i], 1e-4) - y_s_local[i] = T.if_then_else(row * blk_m + i < batch_sizes[bg], - y_amax_local[i] / fp8_max, 0) + y_s_local[i] = 
T.if_then_else(row * blk_m + i < batch_sizes[bg], y_amax_local[i] / fp8_max, 0) for i, j in T.Parallel(blk_m, group_size): y_q_local[i, j] = T.clamp(y_local[i, j] / y_s_local[i], fp8_min, fp8_max) T.copy(y_q_local, y_q_local_fp8) for i, j in T.Parallel(blk_m, group_size): - y_q_local_fp8[i, j] = T.if_then_else(row * blk_m + i < batch_sizes[bg], - y_q_local[i, j], 0) + y_q_local_fp8[i, j] = T.if_then_else(row * blk_m + i < batch_sizes[bg], y_q_local[i, j], 0) for i in T.Parallel(blk_m): X_amax[bg, row * blk_m + i, row_g_id] = y_s_local[i] - T.copy( - y_q_local_fp8, X_fp8[bg, row * blk_m:(row + 1) * blk_m, - row_g_id * group_size:(row_g_id + 1) * group_size]) + T.copy(y_q_local_fp8, X_fp8[bg, row * blk_m : (row + 1) * blk_m, row_g_id * group_size : (row_g_id + 1) * group_size]) return group_per_split_token_cast @@ -127,8 +119,7 @@ def get_col_major_tma_aligned_tensor(x: torch.Tensor) -> torch.Tensor: return x.squeeze(0) if remove_dim else x # Normal layout requires transposing - aligned_x = torch.transpose( - torch.empty((b, n, aligned_m), device=x.device, dtype=x.dtype), 1, 2) + aligned_x = torch.transpose(torch.empty((b, n, aligned_m), device=x.device, dtype=x.dtype), 1, 2) aligned_x[:, :m, :] = x aligned_x = aligned_x[:, :m, :] return aligned_x.squeeze(0) if remove_dim else aligned_x @@ -146,15 +137,17 @@ def ref_per_token_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tens x_fp8 = x_fp8.view(m, -1)[:, :n].contiguous() return x_fp8, (x_amax / 448.0).view(m, -1) -def ref_program(x: torch.Tensor, batch_sizes: torch.Tensor) -> \ - Tuple[torch.Tensor, torch.Tensor]: + +def ref_program(x: torch.Tensor, batch_sizes: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: # assert x.shape[0] == batch_sizes.sum() M_max = ceil_div(batch_sizes.max(), 128) * 128 split_x = torch.split(x, batch_sizes.tolist(), dim=0) padded_x = [torch.nn.functional.pad(t, (0, 0, 0, M_max - t.shape[0])) for t in split_x] num_groups, m, n = batch_sizes.shape[0], M_max, x.shape[1] - x_fp8 = (torch.empty((num_groups, m, n), device='cuda', dtype=torch.float8_e4m3fn), - torch.empty((num_groups, m, n // 128), device='cuda', dtype=torch.float)) + x_fp8 = ( + torch.empty((num_groups, m, n), device="cuda", dtype=torch.float8_e4m3fn), + torch.empty((num_groups, m, n // 128), device="cuda", dtype=torch.float), + ) for i in range(num_groups): x_fp8[0][i], x_fp8[1][i] = ref_per_token_cast_to_fp8(padded_x[i]) x_fp8 = (x_fp8[0], get_col_major_tma_aligned_tensor(x_fp8[1])) @@ -164,11 +157,11 @@ def ref_program(x: torch.Tensor, batch_sizes: torch.Tensor) -> \ def main(M=8192, N=8192, BG=2, blk_m=8, batch_sizes=None): if batch_sizes is None: batch_sizes = [2048, 6144] - if dtype == "float": + if dtype == T.float: x = torch.randn(M, N, device="cuda", dtype=torch.float32) - elif dtype == "float16": + elif dtype == T.float16: x = torch.randn(M, N, device="cuda", dtype=torch.float16) - elif dtype == "bfloat16": + elif dtype == T.bfloat16: x = torch.randn(M, N, device="cuda", dtype=torch.bfloat16) else: raise ValueError(f"Unsupported dtype: {dtype}") @@ -206,5 +199,35 @@ def run_torch(): print("Torch: {:.2f} ms".format(latency)) +def run_regression_perf(M=8192, N=8192, BG=2, blk_m=8, batch_sizes=None): + if batch_sizes is None: + batch_sizes = [2048, 6144] + if dtype == "float": + x = torch.randn(M, N, device="cuda", dtype=torch.float32) + elif dtype == "float16": + x = torch.randn(M, N, device="cuda", dtype=torch.float16) + elif dtype == "bfloat16": + x = torch.randn(M, N, device="cuda", dtype=torch.bfloat16) + else: + raise 
ValueError(f"Unsupported dtype: {dtype}") + batch_sizes = torch.tensor(batch_sizes, device="cuda", dtype=torch.int32) + M_max = int(ceil_div(batch_sizes.max(), 128) * 128) + + kernel = group_per_split_token_cast_to_fp8(M, M_max, N, BG, blk_m) + + x_fp8, x_amax = kernel(x, batch_sizes) + x_fp8_ref, x_amax_ref = ref_program(x, batch_sizes) + + torch_assert_close(x_fp8.to(torch.float32), x_fp8_ref.to(torch.float32), rtol=0.01, atol=0.01) + torch_assert_close(x_amax, x_amax_ref, rtol=0.01, atol=0.01) + + from tilelang.profiler import do_bench + + def run_tilelang(): + kernel(x, batch_sizes) + + return do_bench(run_tilelang, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/cast/example_per_token_cast_to_fp8.py b/examples/cast/example_per_token_cast_to_fp8.py index 484a092f0..693e90d30 100644 --- a/examples/cast/example_per_token_cast_to_fp8.py +++ b/examples/cast/example_per_token_cast_to_fp8.py @@ -7,14 +7,15 @@ @tilelang.jit(out_idx=[1, 2]) def per_token_cast_to_fp8(M, N, blk_m): - dtype = "float" + dtype = T.float group_size = 128 fp8_min = -448.0 fp8_max = 448.0 @T.prim_func - def per_token_cast(X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), "float8_e4m3"), - X_amax: T.Tensor((M, T.ceildiv(N, group_size)), dtype)): + def per_token_cast( + X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), T.float8_e4m3fn), X_amax: T.Tensor((M, T.ceildiv(N, group_size)), dtype) + ): with T.Kernel(T.ceildiv(M, blk_m), T.ceildiv(N, group_size), threads=128) as (bx, by): row = bx row_g_id = by @@ -22,18 +23,9 @@ def per_token_cast(X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), "float8_e y_amax_local = T.alloc_fragment((blk_m,), dtype) y_s_local = T.alloc_fragment((blk_m,), dtype) y_q_local = T.alloc_fragment((blk_m, group_size), dtype) - y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), "float8_e4m3") - - T.annotate_layout({ - y_local: - T.Fragment( - y_local.shape, - forward_thread_fn=lambda i, j: (i // (blk_m // 4)) * 32 + j % 32), - }) - - T.copy( - X[row * blk_m:(row + 1) * blk_m, row_g_id * group_size:(row_g_id + 1) * group_size], - y_local) + y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), T.float8_e4m3fn) + + T.copy(X[row * blk_m : (row + 1) * blk_m, row_g_id * group_size : (row_g_id + 1) * group_size], y_local) T.reduce_absmax(y_local, y_amax_local, dim=1) for i in T.Parallel(blk_m): y_amax_local[i] = T.max(y_amax_local[i], 1e-4) @@ -43,9 +35,7 @@ def per_token_cast(X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), "float8_e T.copy(y_q_local, y_q_local_fp8) for i in T.Parallel(blk_m): X_amax[row * blk_m + i, row_g_id] = y_s_local[i] - T.copy( - y_q_local_fp8, X_fp8[row * blk_m:(row + 1) * blk_m, - row_g_id * group_size:(row_g_id + 1) * group_size]) + T.copy(y_q_local_fp8, X_fp8[row * blk_m : (row + 1) * blk_m, row_g_id * group_size : (row_g_id + 1) * group_size]) return per_token_cast @@ -105,8 +95,7 @@ def main(M=8192, N=8192, blk_m=8): from example_triton_cast_to_fp8 import per_token_group_quant_fp8 def run_triton(): - x_fp8_triton_, x_amax_triton_ = per_token_group_quant_fp8( - x, 128, 1e-4, dtype=torch.float8_e4m3fn, column_major_scales=False) + x_fp8_triton_, x_amax_triton_ = per_token_group_quant_fp8(x, 128, 1e-4, dtype=torch.float8_e4m3fn, column_major_scales=False) return x_fp8_triton_, x_amax_triton_ x_fp8_triton, x_amax_triton = run_triton() @@ -114,5 +103,16 @@ def run_triton(): print("Triton: {:.2f} ms".format(latency)) +def run_regression_perf(M=8192, N=8192, blk_m=8): + kernel = per_token_cast_to_fp8(M, N, blk_m) + x = torch.randn(M, N, 
device="cuda", dtype=torch.float32) + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(x) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/cast/example_triton_cast_to_fp8.py b/examples/cast/example_triton_cast_to_fp8.py index cc56defe7..1859433f1 100644 --- a/examples/cast/example_triton_cast_to_fp8.py +++ b/examples/cast/example_triton_cast_to_fp8.py @@ -128,9 +128,7 @@ def per_token_group_quant_fp8( Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization. """ - assert (x.shape[-1] % - group_size == 0), (f"the last dimension of `x` {x.shape[-1]} must be divisible " - f"by `group_size` {group_size}") + assert x.shape[-1] % group_size == 0, f"the last dimension of `x` {x.shape[-1]} must be divisible by `group_size` {group_size}" assert x.stride(-1) == 1, "`x` groups must be contiguous" finfo = torch.finfo(dtype) diff --git a/examples/cast/regression_example_cast.py b/examples/cast/regression_example_cast.py new file mode 100644 index 000000000..4bdfb99e7 --- /dev/null +++ b/examples/cast/regression_example_cast.py @@ -0,0 +1,17 @@ +import tilelang.testing +import example_group_per_split_token_cast_to_fp8 +import example_per_token_cast_to_fp8 + + +def regression_example_group_per_split_token_cast_to_fp8(): + tilelang.testing.process_func( + example_group_per_split_token_cast_to_fp8.run_regression_perf, M=1024, N=1024, BG=2, blk_m=4, batch_sizes=[128, 896] + ) + + +def regression_example_per_token_cast_to_fp8(): + tilelang.testing.process_func(example_per_token_cast_to_fp8.run_regression_perf, M=2048, N=512, blk_m=8) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/cast/test_example_cast.py b/examples/cast/test_example_cast.py index 1ca000eb2..e8b10a797 100644 --- a/examples/cast/test_example_cast.py +++ b/examples/cast/test_example_cast.py @@ -4,8 +4,7 @@ def test_example_group_per_split_token_cast_to_fp8(): - example_group_per_split_token_cast_to_fp8.main( - M=1024, N=1024, BG=2, blk_m=4, batch_sizes=[128, 896]) + example_group_per_split_token_cast_to_fp8.main(M=1024, N=1024, BG=2, blk_m=4, batch_sizes=[128, 896]) def test_example_per_token_cast_to_fp8(): diff --git a/examples/compile_flags/usecase.py b/examples/compile_flags/usecase.py deleted file mode 100644 index 8451b04fc..000000000 --- a/examples/compile_flags/usecase.py +++ /dev/null @@ -1,56 +0,0 @@ -import tilelang -import tilelang.language as T - - -# @tilelang.jit(compile_flags=["-O3", "--use_fast_math", "--expt-relaxed-constexpr"]) -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - - @T.prim_func - def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), - ): - # Initialize Kernel Context - with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): - A_shared = T.alloc_shared((block_M, block_K), dtype) - B_shared = T.alloc_shared((block_K, block_N), dtype) - C_local = T.alloc_fragment((block_M, block_N), accum_dtype) - T.clear(C_local) - - for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3): - T.copy(A[by * block_M, ko * block_K], A_shared) - T.copy(B[ko * block_K, bx * block_N], B_shared) - T.gemm(A_shared, B_shared, C_local) - T.copy(C_local, C[by * block_M, bx * block_N]) - - return main - - -M = 1024 -N = 1024 -K = 1024 -block_M = 128 -block_N = 128 -block_K = 32 - -func = matmul(M, N, K, block_M, block_N, block_K) - -jit_kernel = 
tilelang.compile( - func, out_idx=[2], target="cuda", compile_flags="-O3 --use_fast_math --expt-relaxed-constexpr") -# or jit_kernel = tilelang.compile(func, out_idx=[2], target="cuda", compile_flags=["-O3", "--use_fast_math", "--expt-relaxed-constexpr"]) -# or jit_kernel = tilelang.compile(func, out_idx=[2], target="cuda", compile_flags=["-O3 --use_fast_math --expt-relaxed-constexpr"]) - -import torch - -a = torch.randn(M, K, device="cuda", dtype=torch.float16) -b = torch.randn(K, N, device="cuda", dtype=torch.float16) - -c = jit_kernel(a, b) - -print(c) - -ref_c = a @ b - -torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) -print("Kernel output matches PyTorch reference.") diff --git a/examples/conftest.py b/examples/conftest.py index 9f49d40a9..4010e0d83 100644 --- a/examples/conftest.py +++ b/examples/conftest.py @@ -33,12 +33,9 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config): "warnings", "error", } - if (sum( - len(terminalreporter.stats.get(k, [])) - for k in known_types.difference({"skipped", "deselected"})) == 0): + if sum(len(terminalreporter.stats.get(k, [])) for k in known_types.difference({"skipped", "deselected"})) == 0: terminalreporter.write_sep( "!", - (f"Error: No tests were collected. " - f"{dict(sorted((k, len(v)) for k, v in terminalreporter.stats.items()))}"), + (f"Error: No tests were collected. {dict(sorted((k, len(v)) for k, v in terminalreporter.stats.items()))}"), ) pytest.exit("No tests were collected.", returncode=5) diff --git a/examples/convolution/example_convolution.py b/examples/convolution/example_convolution.py index b2696ba8f..1599d3464 100644 --- a/examples/convolution/example_convolution.py +++ b/examples/convolution/example_convolution.py @@ -14,7 +14,6 @@ def check_hopper(): def ref_program(stride, padding, dilation): - def main(A, B): A = A.permute(0, 3, 1, 2) # N, H, W, C -> N, C, H, W B = B.permute(3, 2, 0, 1) # H, W, C, F -> F, C, H, W @@ -26,38 +25,21 @@ def main(A, B): @tilelang.jit(out_idx=[2]) -def convolution(N, - C, - H, - W, - F, - K, - S, - D, - P, - block_M, - block_N, - block_K, - num_stages, - threads, - dtype="float16", - accum_dtype="float"): +def convolution(N, C, H, W, F, K, S, D, P, block_M, block_N, block_K, num_stages, threads, dtype=T.float16, accum_dtype=T.float32): KH, KW = K, K OH = (H + 2 * P - D * (K - 1) - 1) // S + 1 OW = (W + 2 * P - D * (K - 1) - 1) // S + 1 - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 is_hopper = check_hopper() @T.prim_func def main( - data: T.Tensor((N, H, W, C), dtype), - kernel: T.Tensor((KH, KW, C, F), dtype), - out: T.Tensor((N, OH, OW, F), dtype), + data: T.Tensor((N, H, W, C), dtype), + kernel: T.Tensor((KH, KW, C, F), dtype), + out: T.Tensor((N, OH, OW, F), dtype), ): - with T.Kernel( - T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), - threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), threads=threads) as (bx, by): data_shared = T.alloc_shared((block_M, block_K), dtype) kernel_shared = T.alloc_shared((block_K, block_N), dtype) out_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -66,12 +48,6 @@ def main( kernel_flat = T.Tensor((KH * KW * C, F), dtype, kernel.data) out_flat = T.Tensor((N * OH * OW, F), dtype, out.data) - T.annotate_layout({ - out_shared: tilelang.layout.make_swizzled_layout(out_shared), - data_shared: tilelang.layout.make_swizzled_layout(data_shared), - kernel_shared: tilelang.layout.make_swizzled_layout(kernel_shared), - }) - 
T.clear(out_local) for k_iter in T.Pipelined(T.ceildiv(KH * KW * C, block_K), num_stages=num_stages): if is_hopper: @@ -82,10 +58,8 @@ def main( m = by * block_M + i access_h = m % (OH * OW) // OW * S + k // (KW * C) * D - P access_w = m % OW * S + k // C % KW * D - P - in_bound = ((access_h >= 0) and (access_w >= 0) and (access_h < H) and - (access_w < W)) - data_shared[i, j] = T.if_then_else( - in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) + in_bound = (access_h >= 0) and (access_w >= 0) and (access_h < H) and (access_w < W) + data_shared[i, j] = T.if_then_else(in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) T.copy(kernel_flat[k_iter * block_K, bx * block_N], kernel_shared) T.gemm(data_shared, kernel_shared, out_local) @@ -97,15 +71,15 @@ def main( def main(argv=None): parser = argparse.ArgumentParser() - parser.add_argument('--n', type=int, default=128, help='n') - parser.add_argument('--c', type=int, default=128, help='c') - parser.add_argument('--h', type=int, default=64, help='h') - parser.add_argument('--w', type=int, default=64, help='w') - parser.add_argument('--f', type=int, default=128, help='f') - parser.add_argument('--k', type=int, default=3, help='k') - parser.add_argument('--s', type=int, default=1, help='s') - parser.add_argument('--d', type=int, default=1, help='d') - parser.add_argument('--p', type=int, default=1, help='p') + parser.add_argument("--n", type=int, default=128, help="n") + parser.add_argument("--c", type=int, default=128, help="c") + parser.add_argument("--h", type=int, default=64, help="h") + parser.add_argument("--w", type=int, default=64, help="w") + parser.add_argument("--f", type=int, default=128, help="f") + parser.add_argument("--k", type=int, default=3, help="k") + parser.add_argument("--s", type=int, default=1, help="s") + parser.add_argument("--d", type=int, default=1, help="d") + parser.add_argument("--p", type=int, default=1, help="p") args = parser.parse_args(argv) N, C, H, W, F, K, S, D, P = args.n, args.c, args.h, args.w, args.f, args.k, args.s, args.d, args.p @@ -125,5 +99,30 @@ def main(argv=None): print("All checks passed.✅") +def run_regression_perf(argv=None): + parser = argparse.ArgumentParser() + parser.add_argument("--n", type=int, default=128, help="n") + parser.add_argument("--c", type=int, default=128, help="c") + parser.add_argument("--h", type=int, default=64, help="h") + parser.add_argument("--w", type=int, default=64, help="w") + parser.add_argument("--f", type=int, default=128, help="f") + parser.add_argument("--k", type=int, default=3, help="k") + parser.add_argument("--s", type=int, default=1, help="s") + parser.add_argument("--d", type=int, default=1, help="d") + parser.add_argument("--p", type=int, default=1, help="p") + + args = parser.parse_args(argv) + N, C, H, W, F, K, S, D, P = args.n, args.c, args.h, args.w, args.f, args.k, args.s, args.d, args.p + + block_m = 64 + block_n = 128 + block_k = 32 + num_stages = 3 + threads = 256 + kernel = convolution(N, C, H, W, F, K, S, D, P, block_m, block_n, block_k, num_stages, threads) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/convolution/example_convolution_autotune.py b/examples/convolution/example_convolution_autotune.py index 393677489..c0c666402 100644 --- a/examples/convolution/example_convolution_autotune.py +++ b/examples/convolution/example_convolution_autotune.py @@ -14,7 +14,6 @@ def 
check_hopper(): def ref_program(stride, padding, dilation): - def main(A, B): A = A.permute(0, 3, 1, 2) # N, H, W, C -> N, C, H, W B = B.permute(3, 2, 0, 1) # H, W, C, F -> F, C, H, W @@ -40,7 +39,8 @@ def get_configs(): num_stages, thread_num, enable_rasterization, - )) + ) + ) configs = [ { @@ -50,7 +50,8 @@ def get_configs(): "num_stages": c[3], "thread_num": c[4], "enable_rasteration": c[5], # keep param name for backward-compat - } for c in _configs + } + for c in _configs ] return configs @@ -64,69 +65,32 @@ def get_heuristic_config() -> dict: sm_version = sm_major * 10 + sm_minor print(f"CUDA device capability: {sm_version}") if sm_version in {80}: - return { - "block_M": 128, - "block_N": 256, - "block_K": 32, - "num_stages": 2, - "thread_num": 128, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 32, "num_stages": 2, "thread_num": 128, "enable_rasteration": True} elif sm_version in {90}: - return { - "block_M": 128, - "block_N": 256, - "block_K": 64, - "num_stages": 3, - "thread_num": 256, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 64, "num_stages": 3, "thread_num": 256, "enable_rasteration": True} else: - return { - "block_M": 128, - "block_N": 256, - "block_K": 32, - "num_stages": 0, - "thread_num": 128, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 32, "num_stages": 0, "thread_num": 128, "enable_rasteration": True} @tilelang.autotune(configs=get_configs()) @tilelang.jit(out_idx=[2]) -def convolution(N, - C, - H, - W, - F, - K, - S, - D, - P, - block_M, - block_N, - block_K, - num_stages, - thread_num, - enable_rasteration, - dtype="float16", - accum_dtype="float"): +def convolution( + N, C, H, W, F, K, S, D, P, block_M, block_N, block_K, num_stages, thread_num, enable_rasteration, dtype=T.float16, accum_dtype=T.float32 +): KH, KW = K, K OH = (H + 2 * P - D * (K - 1) - 1) // S + 1 OW = (W + 2 * P - D * (K - 1) - 1) // S + 1 - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 is_hopper = check_hopper() @T.prim_func def main( - data: T.Tensor((N, H, W, C), dtype), - kernel: T.Tensor((KH, KW, C, F), dtype), - out: T.Tensor((N, OH, OW, F), dtype), + data: T.Tensor((N, H, W, C), dtype), + kernel: T.Tensor((KH, KW, C, F), dtype), + out: T.Tensor((N, OH, OW, F), dtype), ): - with T.Kernel( - T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), - threads=thread_num) as (bx, by): + with T.Kernel(T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), threads=thread_num) as (bx, by): data_shared = T.alloc_shared((block_M, block_K), dtype) kernel_shared = T.alloc_shared((block_K, block_N), dtype) out_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -135,11 +99,6 @@ def main( kernel_flat = T.Tensor((KH * KW * C, F), dtype, kernel.data) out_flat = T.Tensor((N * OH * OW, F), dtype, out.data) - if is_hopper: - T.annotate_layout({ - out_shared: tilelang.layout.make_swizzled_layout(out_shared), - }) - T.clear(out_local) for k_iter in T.Pipelined(T.ceildiv(KH * KW * C, block_K), num_stages=num_stages): if is_hopper: @@ -150,10 +109,8 @@ def main( m = by * block_M + i access_h = m % (OH * OW) // OW * S + k // (KW * C) * D - P access_w = m % OW * S + k // C % KW * D - P - in_bound = ((access_h >= 0) and (access_w >= 0) and (access_h < H) and - (access_w < W)) - data_shared[i, j] = T.if_then_else( - in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) + in_bound = (access_h >= 0) and (access_w >= 0) and (access_h < H) 
and (access_w < W) + data_shared[i, j] = T.if_then_else(in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) T.copy(kernel_flat[k_iter * block_K, bx * block_N], kernel_shared) T.gemm(data_shared, kernel_shared, out_local) @@ -166,17 +123,19 @@ def main( return main -def main(n: int = 128, - c: int = 128, - h: int = 64, - w: int = 64, - f: int = 128, - k: int = 3, - s: int = 1, - d: int = 1, - p: int = 1, - use_autotune: bool = False, - with_roller: bool = True): +def main( + n: int = 128, + c: int = 128, + h: int = 64, + w: int = 64, + f: int = 128, + k: int = 3, + s: int = 1, + d: int = 1, + p: int = 1, + use_autotune: bool = False, + with_roller: bool = True, +): N, C, H, W, F, K, S, D, P = n, c, h, w, f, k, s, d, p ref_prog = ref_program(S, P, D) @@ -194,27 +153,38 @@ def main(n: int = 128, print(f"Ref latency: {ref_latency}") +def run_regression_perf( + n: int = 128, + c: int = 128, + h: int = 64, + w: int = 64, + f: int = 128, + k: int = 3, + s: int = 1, + d: int = 1, + p: int = 1, + use_autotune: bool = False, + with_roller: bool = True, +): + N, C, H, W, F, K, S, D, P = n, c, h, w, f, k, s, d, p + config = get_heuristic_config() + kernel = convolution(N, C, H, W, F, K, S, D, P, **config) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark") - parser.add_argument('--n', type=int, default=128, help='n') - parser.add_argument('--c', type=int, default=128, help='c') - parser.add_argument('--h', type=int, default=64, help='h') - parser.add_argument('--w', type=int, default=64, help='w') - parser.add_argument('--f', type=int, default=128, help='f') - parser.add_argument('--k', type=int, default=3, help='k') - parser.add_argument('--s', type=int, default=1, help='s') - parser.add_argument('--d', type=int, default=1, help='d') - parser.add_argument('--p', type=int, default=1, help='p') - parser.add_argument( - "--use_autotune", - action="store_true", - default=False, - help="Whether to use autotune for matmul configs") - parser.add_argument( - "--with_roller", - action="store_true", - default=True, - help="Whether to enable BitBLAS roller for search space") + parser.add_argument("--n", type=int, default=128, help="n") + parser.add_argument("--c", type=int, default=128, help="c") + parser.add_argument("--h", type=int, default=64, help="h") + parser.add_argument("--w", type=int, default=64, help="w") + parser.add_argument("--f", type=int, default=128, help="f") + parser.add_argument("--k", type=int, default=3, help="k") + parser.add_argument("--s", type=int, default=1, help="s") + parser.add_argument("--d", type=int, default=1, help="d") + parser.add_argument("--p", type=int, default=1, help="p") + parser.add_argument("--use_autotune", action="store_true", default=False, help="Whether to use autotune for matmul configs") + parser.add_argument("--with_roller", action="store_true", default=True, help="Whether to enable BitBLAS roller for search space") args = parser.parse_args() - main(args.n, args.c, args.h, args.w, args.f, args.k, args.s, args.d, args.p, args.use_autotune, - args.with_roller) + main(args.n, args.c, args.h, args.w, args.f, args.k, args.s, args.d, args.p, args.use_autotune, args.with_roller) diff --git a/examples/convolution/regression_example_convolution.py b/examples/convolution/regression_example_convolution.py new file mode 100644 index 000000000..18d4bcb68 --- /dev/null +++ 
b/examples/convolution/regression_example_convolution.py @@ -0,0 +1,15 @@ +import tilelang.testing +import example_convolution +import example_convolution_autotune + + +def regression_example_convolution(): + tilelang.testing.process_func(example_convolution.run_regression_perf) + + +def regression_example_convolution_autotune(): + tilelang.testing.process_func(example_convolution_autotune.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py b/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py index 715f09a9b..18467a811 100644 --- a/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py +++ b/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py @@ -20,11 +20,11 @@ def tl_gemm( accum_dtype, ): assert in_dtype in [ - "float8_e4m3", + T.float8_e4m3fn, ], "Currently only float8_e4m3 is supported" assert out_dtype in [ - "bfloat16", - "float32", + T.bfloat16, + T.float32, ], "Currently only float16 and float32 are supported" group_size = 128 @@ -41,18 +41,17 @@ def tl_gemm( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), - scales_a: T.Tensor(Scales_A_shape, "float32"), - scales_b: T.Tensor(Scales_B_shape, "float32"), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + scales_a: T.Tensor(Scales_A_shape, T.float32), + scales_b: T.Tensor(Scales_B_shape, T.float32), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, in_dtype) C_shared = T.alloc_shared(C_shared_shape, out_dtype) - Scale_C_shared = T.alloc_shared((block_M), "float32") + Scale_C_shared = T.alloc_shared((block_M), T.float32) C_local = T.alloc_fragment(C_shared_shape, accum_dtype) C_local_accum = T.alloc_fragment(C_shared_shape, accum_dtype) @@ -93,21 +92,18 @@ def per_token_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: m, n = x.shape x_view = x.view(m, -1, 128) x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) - return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view( - m, n), (x_amax / 448.0).view(m, -1) + return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1) def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: assert x.dim() == 2 m, n = x.shape - x_padded = torch.zeros( - ceildiv(m, 128) * 128, ceildiv(n, 128) * 128, dtype=x.dtype, device=x.device) + x_padded = torch.zeros(ceildiv(m, 128) * 128, ceildiv(n, 128) * 128, dtype=x.dtype, device=x.device) x_padded[:m, :n] = x x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (x_amax / 448.0).view( - x_view.size(0), x_view.size(2)) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (x_amax / 448.0).view(x_view.size(0), x_view.size(2)) def ref_deepgemm_fp8(A_fp8, B_fp8, A_scale, B_scale, out_dtype): @@ -127,13 +123,14 @@ def ref_deepgemm_fp8(A_fp8, B_fp8, A_scale, B_scale, out_dtype): c_acc.zero_() for k in range(ceildiv(K, 128)): c = torch._scaled_mm( - A_fp8[i * 128:(i + 1) * 128, k * 128:(k + 1) * 128], - B_fp8[j * 128:(j + 1) * 128, k * 128:(k + 1) * 128].T, + 
A_fp8[i * 128 : (i + 1) * 128, k * 128 : (k + 1) * 128], + B_fp8[j * 128 : (j + 1) * 128, k * 128 : (k + 1) * 128].T, scale_a=A_scales[i, k].view(128, 1).contiguous(), scale_b=B_scales[j, k].view(1, 128).contiguous(), - out_dtype=torch.bfloat16) + out_dtype=torch.bfloat16, + ) c_acc += c.to(torch.float32) - C[i * 128:(i + 1) * 128, j * 128:(j + 1) * 128] = c_acc.to(out_dtype) + C[i * 128 : (i + 1) * 128, j * 128 : (j + 1) * 128] = c_acc.to(out_dtype) return C @@ -179,11 +176,11 @@ def assert_tl_gemm_correctness(M, N, K, block_N, in_dtype, out_dtype, accum_dtyp def main(): - assert_tl_gemm_correctness(1024, 1024, 8192, 128, "float8_e4m3", "bfloat16", "float32") + assert_tl_gemm_correctness(1024, 1024, 8192, 128, T.float8_e4m3fn, T.bfloat16, T.float32) if __name__ == "__main__": - for dtype in ["float8_e4m3"]: - for out_dtype in ["bfloat16", "float32"]: + for dtype in [T.float8_e4m3fn]: + for out_dtype in [T.bfloat16, T.float32]: for block_N in [16, 32, 64, 128]: - assert_tl_gemm_correctness(1024, 1024, 8192, block_N, dtype, out_dtype, "float32") + assert_tl_gemm_correctness(1024, 1024, 8192, block_N, dtype, out_dtype, T.float32) diff --git a/examples/deepseek_mhc/example_mhc_post.py b/examples/deepseek_mhc/example_mhc_post.py new file mode 100644 index 000000000..feec31bc9 --- /dev/null +++ b/examples/deepseek_mhc/example_mhc_post.py @@ -0,0 +1,114 @@ +import math + +import torch + +import tilelang +import tilelang.language as T + + +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_PTXAS_REGISTER_USAGE_LEVEL: 10, + }, +) +def mhc_post_tilelang(a, b, c, d, x, hc: int, hidden: int, n_thr: int = 128, h_blk: int = 1024) -> tilelang.JITKernel: + # rename for shorter code + n = T.dynamic("num_tokens") + h = hidden + + h_blk = math.gcd(hidden, h_blk) + a: T.Tensor((n, hc, hc), T.float32) + b: T.Tensor((n, hc, h), T.bfloat16) + c: T.Tensor((n, hc), T.float32) + d: T.Tensor((n, h), T.bfloat16) + x: T.Tensor((n, hc, h), T.bfloat16) + with T.Kernel(n, threads=n_thr) as i_n: + x_shared = T.alloc_shared((hc, h_blk), T.bfloat16) + b_shared = T.alloc_shared((hc, h_blk), T.bfloat16) + d_shared = T.alloc_shared(h_blk, T.bfloat16) + + x_local = T.alloc_fragment((hc, h_blk), T.float32) + b_local = T.alloc_fragment((hc, h_blk), T.float32) + d_local = T.alloc_fragment(h_blk, T.float32) + + a_local = T.alloc_fragment((hc, hc), T.float32) + c_local = T.alloc_fragment(hc, T.float32) + T.copy(a[i_n, 0, 0], a_local) + T.copy(c[i_n, 0], c_local) + + for i0_h in T.Pipelined(T.ceildiv(h, h_blk), num_stages=2): + T.copy(b[i_n, 0, i0_h * h_blk], b_shared) + T.copy(d[i_n, i0_h * h_blk], d_shared) + + T.copy(b_shared, b_local) + T.copy(d_shared, d_local) + for i_hco, i1_h in T.Parallel(hc, h_blk): + x_local[i_hco, i1_h] = c_local[i_hco] * d_local[i1_h] + for i_hci in T.serial(hc): + x_local[i_hco, i1_h] += a_local[i_hci, i_hco] * b_local[i_hci, i1_h] + T.copy(x_local, x_shared) + + T.copy(x_shared, x[i_n, 0, i0_h * h_blk]) + + +def mhc_post( + x: torch.Tensor, + residual: torch.Tensor, + post_layer_mix: torch.Tensor, + comb_res_mix: torch.Tensor, +) -> torch.Tensor: + out = torch.empty_like(residual) + mhc_post_tilelang(comb_res_mix, residual, post_layer_mix.squeeze(-1), x, out, residual.shape[-2], residual.shape[-1]) + return out + + +def mhc_post_ref( + x: torch.Tensor, + residual: torch.Tensor, + post_layer_mix: torch.Tensor, + comb_res_mix: torch.Tensor, +) -> torch.Tensor: + term2 = 
torch.bmm(comb_res_mix.mT, residual.float()) + return (x.float().unsqueeze(-2) * post_layer_mix + term2).bfloat16() + + +def generate_test_data( + n: int, + h: int, + hc_mult: int, + device: str = "cuda", +) -> dict[str, torch.Tensor]: + """Generate test data for post operator.""" + torch.random.manual_seed(42) + + x = torch.randn((n, h), dtype=torch.bfloat16, device=device) + residual = torch.randn((n, hc_mult, h), dtype=torch.bfloat16, device=device) + post_layer_mix = torch.randn((n, hc_mult, 1), dtype=torch.float32, device=device) + comb_res_mix = torch.randn((n, hc_mult, hc_mult), dtype=torch.float32, device=device) + + return { + "x": x, + "residual": residual, + "post_layer_mix": post_layer_mix, + "comb_res_mix": comb_res_mix, + } + + +def test(n: int, h: int) -> None: + print(f"Testing mhc_post with {n=} {h=}") + test_data = generate_test_data(n=n, h=h, hc_mult=4) + out_tl = mhc_post(**test_data) + out_ref = mhc_post_ref(**test_data) + torch.testing.assert_close(out_tl, out_ref) + + +def main(): + for n in [4096]: + for h in [1280, 2560, 7168]: + test(n=n, h=h) + + +if __name__ == "__main__": + main() diff --git a/examples/deepseek_mhc/example_mhc_pre.py b/examples/deepseek_mhc/example_mhc_pre.py new file mode 100644 index 000000000..9dbd66839 --- /dev/null +++ b/examples/deepseek_mhc/example_mhc_pre.py @@ -0,0 +1,419 @@ +import math + +import tilelang +import tilelang.language as T +import torch + + +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_PTXAS_REGISTER_USAGE_LEVEL: 10, + }, +) +def mhc_pre_big_fuse_tilelang( + gemm_out_mul, + gemm_out_sqrsum, + hc_scale, + hc_base, + residual, + post_mix, + comb_mix, + layer_input, + hidden_size: int, + rms_eps: float, + hc_pre_eps: float, + hc_sinkhorn_eps: float, + hc_post_mult_value: float, + sinkhorn_repeat: int, + n_splits: int = 16, + hc_mult: int = 4, +): + """Deeply fused kernels, everything other than gemm & sqrsum in mHC pre block.""" + num_tokens = T.dynamic("num_tokens") + hc_mult3 = hc_mult * (2 + hc_mult) + hidden_block = math.gcd(512, hidden_size) + + gemm_out_mul: T.Tensor[[n_splits, num_tokens, hc_mult3], T.float32] + gemm_out_sqrsum: T.Tensor[[n_splits, num_tokens], T.float32] + hc_scale: T.Tensor[[3], T.float32] + hc_base: T.Tensor[[hc_mult3], T.float32] + residual: T.Tensor[[num_tokens, hc_mult, hidden_size], T.bfloat16] + # outputs + post_mix: T.Tensor[[num_tokens, hc_mult], T.float32] + comb_mix: T.Tensor[[num_tokens, hc_mult * hc_mult], T.float32] + layer_input: T.Tensor[[num_tokens, hidden_size], T.bfloat16] + + with T.Kernel(num_tokens, threads=96) as i: + ################################################################## + # _pre_norm_fn_fwd_norm + rms = T.alloc_fragment(1, T.float32) + mixes = T.alloc_fragment(hc_mult3, T.float32) + T.clear(mixes) + rms[0] = 0 + for i_split in T.serial(n_splits): + rms[0] += gemm_out_sqrsum[i_split, i] + rms[0] = T.rsqrt(rms[0] / (hc_mult * hidden_size) + rms_eps) + for j in T.Parallel(hc_mult3): + mixes[j] = 0 + for i_split in T.serial(n_splits): + mixes[j] += gemm_out_mul[i_split, i, j] + mixes[j] *= rms[0] + mixes_shared = T.alloc_shared(hc_mult3, T.float32) + T.copy(mixes, mixes_shared) + + if T.get_thread_binding() < 32: + ################################################################## + # _pre_split_mixes_fwd (post & comb) + cm = T.alloc_fragment((hc_mult, hc_mult), T.float32) + for j in T.Parallel(hc_mult): + post_mix[i, j] = 
T.sigmoid(mixes_shared[j + hc_mult] * hc_scale[1] + hc_base[j + hc_mult]) * hc_post_mult_value + for j, k in T.Parallel(hc_mult, hc_mult): + cm[j, k] = mixes_shared[j * hc_mult + k + hc_mult * 2] * hc_scale[2] + hc_base[j * hc_mult + k + hc_mult * 2] + + ################################################################## + # _sinkhorn_fwd + row_sum = T.alloc_fragment(hc_mult, T.float32) + col_sum = T.alloc_fragment(hc_mult, T.float32) + + # comb = comb.softmax(-1) + eps + row_max = T.alloc_fragment(hc_mult, T.float32) + T.reduce_max(cm, row_max, dim=1) + for j, k in T.Parallel(hc_mult, hc_mult): + cm[j, k] = T.exp(cm[j, k] - row_max[j]) + T.reduce_sum(cm, row_sum, dim=1) + for j, k in T.Parallel(hc_mult, hc_mult): + cm[j, k] = cm[j, k] / row_sum[j] + hc_sinkhorn_eps + + # comb = comb / (comb.sum(-2) + eps) + T.reduce_sum(cm, col_sum, dim=0) + for j, k in T.Parallel(hc_mult, hc_mult): + cm[j, k] = cm[j, k] / (col_sum[k] + hc_sinkhorn_eps) + + for _ in T.serial(sinkhorn_repeat - 1): + # comb = comb / (comb.sum(-1) + eps) + T.reduce_sum(cm, row_sum, dim=1) + for j, k in T.Parallel(hc_mult, hc_mult): + cm[j, k] = cm[j, k] / (row_sum[j] + hc_sinkhorn_eps) + + # comb = comb / (comb.sum(-2) + eps) + T.reduce_sum(cm, col_sum, dim=0) + for j, k in T.Parallel(hc_mult, hc_mult): + cm[j, k] = cm[j, k] / (col_sum[k] + hc_sinkhorn_eps) + + # save comb_mix to global memory + for j, k in T.Parallel(hc_mult, hc_mult): + comb_mix[i, j * hc_mult + k] = cm[j, k] + else: + ################################################################## + # _pre_split_mixes_fwd (pre) + pre_mix_shared = T.alloc_shared(hc_mult, T.float32) + for j in T.Parallel(hc_mult): + pre_mix_shared[j] = ( + T.sigmoid( + mixes_shared[j] * hc_scale[0] + hc_base[j], + ) + + hc_pre_eps + ) + ################################################################### + # _pre_apply_mix_fwd + for i0_h in T.Pipelined(hidden_size // hidden_block, num_stages=2): + xs = T.alloc_shared((hc_mult, hidden_block), T.float32) + xl = T.alloc_fragment((hc_mult, hidden_block), T.float32) + T.copy(residual[i, 0, i0_h * hidden_block], xs) + T.copy(xs, xl) + + ol = T.alloc_fragment(hidden_block, T.float32) + T.clear(ol) + + for i_hc in T.serial(hc_mult): + pre = pre_mix_shared[i_hc] + for i1_h in T.Parallel(hidden_block): + ol[i1_h] += pre * xl[i_hc, i1_h] + + T.copy(ol, layer_input[i, i0_h * hidden_block]) + + +@tilelang.jit +def mhc_pre_gemm_sqrsum_tilelang( + x, + fn, + out, + sqrsum, + hc_mult3: int, + hc_hidden_size: int, + token_block: int = 32, + hidden_block: int = 256, +) -> tilelang.JITKernel: + """Not highly optimized TileLang implementation of fused gemm and sqrsum in mHC pre block.""" + assert hc_mult3 <= 32 # should be 24 usually + num_tokens = T.dynamic("num_tokens") + assert hc_hidden_size % hidden_block == 0 + + x: T.Tensor((num_tokens, hc_hidden_size), T.bfloat16) + fn: T.Tensor((hc_mult3, hc_hidden_size), T.float32) + out: T.Tensor((num_tokens, hc_mult3), T.float32) + sqrsum: T.Tensor((num_tokens), T.float32) + + with T.Kernel(T.ceildiv(num_tokens, token_block)) as px: + out_frag = T.alloc_fragment((token_block, 32), T.float32) + sqrsum_part = T.alloc_fragment((token_block, 4), T.float32) + T.clear(out_frag) + T.clear(sqrsum_part) + for pz in T.Pipelined(hc_hidden_size // hidden_block, num_stages=2): + x_smem_16 = T.alloc_shared((token_block, hidden_block), T.bfloat16) + fn_smem = T.alloc_shared((32, hidden_block), T.float32) + + T.annotate_layout({x_smem_16: tilelang.layout.make_swizzled_layout(x_smem_16)}) + + T.copy(x[px * token_block, pz * 
hidden_block], x_smem_16) + T.copy(fn[0, pz * hidden_block], fn_smem) + + x_frag_16 = T.alloc_fragment((token_block, hidden_block), T.bfloat16) + T.copy(x_smem_16, x_frag_16) + x_frag = T.alloc_fragment((token_block, hidden_block), T.float32) + T.copy(x_frag_16, x_frag) + + for jj in T.serial(hidden_block // 4): + for i, j in T.Parallel(token_block, 4): + sqrsum_part[i, j] += x_frag[i, jj * 4 + j] * x_frag[i, jj * 4 + j] + + # should be TF32 gemm + T.gemm( + x_frag, + fn_smem, + out_frag, + transpose_A=False, + transpose_B=True, + wg_wait=0, + clear_accum=False, + ) + sqrsum_l = T.alloc_fragment(token_block, T.float32) + T.reduce_sum(sqrsum_part, sqrsum_l) + for i in T.Parallel(token_block): + sqrsum[px * token_block + i] = sqrsum_l[i] + for i, j in T.Parallel(token_block, 32): + if j < hc_mult3: + out[px * token_block + i, j] = out_frag[i, j] + + +def mhc_pre( + residual: torch.Tensor, + fn: torch.Tensor, + hc_scale: torch.Tensor, + hc_base: torch.Tensor, + rms_eps: float, + hc_pre_eps: float, + hc_sinkhorn_eps: float, + hc_post_mult_value: float, + sinkhorn_repeat: int, + n_splits: int = 1, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Forward pass for mHC pre block. + + Args: + residual: shape (..., hc_mult, hidden_size), dtype torch.bfloat16 + fn: shape (hc_mult3, hc_mult * hidden_size), dtype torch.float32 + hc_scale: shape (3,), dtype torch.float32 + hc_base: shape (hc_mult3,), dtype torch.float32 + rms_eps: RMS normalization epsilon + hc_pre_eps: pre-mix epsilon + hc_sinkhorn_eps: sinkhorn epsilon + hc_post_mult_value: post-mix multiplier value + sinkhorn_repeat: number of sinkhorn iterations + n_splits: split-k factor; TileLang version of mhc_pre_gemm_sqrsum doesn't support this + + Returns: + post_mix: shape (..., hc_mult), dtype torch.float32 + comb_mix: shape (..., hc_mult, hc_mult), dtype torch.float32 + layer_input: shape (..., hidden_size), dtype torch.bfloat16 + """ + + # Validate shapes + assert residual.dtype == torch.bfloat16 + assert fn.dtype == torch.float32 + assert hc_scale.dtype == torch.float32 + assert hc_base.dtype == torch.float32 + + hc_mult = residual.shape[-2] + hidden_size = residual.shape[-1] + hc_mult2 = hc_mult * hc_mult + hc_mult3 = hc_mult * 2 + hc_mult2 + + hc_hidden_size = hc_mult * hidden_size + assert fn.shape[0] == hc_mult3 + assert fn.shape[1] == hc_hidden_size + assert hc_scale.shape == (3,) + assert hc_base.shape == (hc_mult3,) + + outer_shape = residual.shape[:-2] + + residual_flat = residual.view(-1, hc_mult, hidden_size) + num_tokens = residual_flat.shape[0] + fn_flat = fn + + post_mix = torch.empty(num_tokens, hc_mult, dtype=torch.float32, device=residual.device) + comb_mix = torch.empty(num_tokens, hc_mult2, dtype=torch.float32, device=residual.device) + layer_input = torch.empty(num_tokens, hidden_size, dtype=torch.bfloat16, device=residual.device) + + gemm_out_mul = torch.empty(n_splits, num_tokens, hc_mult3, dtype=torch.float32, device=residual.device) + gemm_out_sqrsum = torch.empty(n_splits, num_tokens, dtype=torch.float32, device=residual.device) + assert n_splits == 1, "The simple TileLang version gemm_sqrsum doesn't support split-k" + mhc_pre_gemm_sqrsum_tilelang( + residual_flat.view(num_tokens, hc_mult * hidden_size), + fn_flat, + gemm_out_mul.squeeze(0), + gemm_out_sqrsum.squeeze(0), + hc_mult3, + hc_mult * hidden_size, + ) + + mhc_pre_big_fuse_tilelang( + gemm_out_mul, + gemm_out_sqrsum, + hc_scale, + hc_base, + residual_flat, + post_mix, + comb_mix, + layer_input, + hidden_size, + rms_eps, + hc_pre_eps, + 
hc_sinkhorn_eps, + hc_post_mult_value, + sinkhorn_repeat, + n_splits, + hc_mult, + ) + + post_mix = post_mix.view(*outer_shape, hc_mult, 1) + comb_mix = comb_mix.view(*outer_shape, hc_mult, hc_mult) + layer_input = layer_input.view(*outer_shape, hidden_size) + + return post_mix, comb_mix, layer_input + + +def sinkhorn_normalize_ref(x: torch.Tensor, repeat: int, eps: float) -> torch.Tensor: + x = x.softmax(-1) + eps + x = x / (x.sum(-2, keepdim=True) + eps) + for _ in range(repeat - 1): + x = x / (x.sum(-1, keepdim=True) + eps) + x = x / (x.sum(-2, keepdim=True) + eps) + return x + + +def mhc_pre_ref( + residual: torch.Tensor, + fn: torch.Tensor, + hc_scale: torch.Tensor, + hc_base: torch.Tensor, + rms_eps: float, + hc_pre_eps: float, + hc_sinkhorn_eps: float, + hc_post_mult_value: float, + sinkhorn_repeat: int, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + hc_mult = residual.shape[-2] + + residual_flat = residual.flatten(-2, -1).float() + sqrsum = residual_flat.square().sum(-1) + mixes = residual_flat @ fn.T * (sqrsum.unsqueeze(-1) / fn.shape[-1] + rms_eps).rsqrt() + + hc_scale = torch.cat( + [ + hc_scale[0].expand(hc_mult), + hc_scale[1].expand(hc_mult), + hc_scale[2].expand(hc_mult * hc_mult), + ], + ) + mixes = mixes * hc_scale + hc_base + + pre_mix = mixes[:, :hc_mult].sigmoid().unsqueeze(-1) + hc_pre_eps + post_mix = (mixes[:, hc_mult : 2 * hc_mult].sigmoid() * hc_post_mult_value).unsqueeze(-1) + res_mix = mixes[:, 2 * hc_mult :].view(-1, hc_mult, hc_mult) + + res_mix = sinkhorn_normalize_ref(res_mix, repeat=sinkhorn_repeat, eps=hc_sinkhorn_eps) + + layer_input = (residual * pre_mix).sum(-2).bfloat16() + + return post_mix, res_mix, layer_input + + +def generate_test_data( + n: int, + hc_mult: int, + hidden_size: int, + rms_eps: float = 1e-6, + hc_pre_eps: float = 1e-6, + hc_sinkhorn_eps: float = 1e-6, + hc_post_mult_value: float = 1.0, + sinkhorn_repeat: int = 10, +) -> dict[str, torch.Tensor | float]: + """Generate test data for big fuse operator.""" + torch.random.manual_seed(42) + + hc_mult2 = hc_mult * hc_mult + hc_mult3 = hc_mult * 2 + hc_mult2 + device = "cuda" + + residual = ( + torch.randn((n, hc_mult, hidden_size), dtype=torch.float, device=device) + .mul(1 + torch.arange(hc_mult, device=device).mul(0.01).view(1, -1, 1)) + .bfloat16() + ) + + fn = ( + torch.randn((hc_mult3, hc_mult, hidden_size), dtype=torch.float, device=device) + * 1e-4 + * (1 + torch.arange(hc_mult, device=device).mul(0.01).view(1, -1, 1)) + ).flatten(1, 2) + + hc_scale = torch.randn((3,), dtype=torch.float, device=device) * 0.1 + + hc_base = torch.randn((hc_mult3,), dtype=torch.float, device=device) * 0.1 + + return { + "residual": residual, + "fn": fn, + "hc_scale": hc_scale, + "hc_base": hc_base, + "rms_eps": rms_eps, + "hc_pre_eps": hc_pre_eps, + "hc_sinkhorn_eps": hc_sinkhorn_eps, + "hc_post_mult_value": hc_post_mult_value, + "sinkhorn_repeat": sinkhorn_repeat, + } + + +def test(n: int, hidden_size: int, hc_mult: int) -> None: + print(f"Testing mhc_pre with {n=} {hidden_size=} {hc_mult=}") + test_data = generate_test_data( + n=n, + hc_mult=hc_mult, + hidden_size=hidden_size, + ) + + # Forward pass with big fuse + post_mix_fused, comb_mix_fused, layer_input_fused = mhc_pre(**test_data) + + # Forward pass with reference + post_mix_ref, comb_mix_ref, layer_input_ref = mhc_pre_ref(**test_data) + + # Compare outputs + torch.testing.assert_close(post_mix_fused, post_mix_ref) + torch.testing.assert_close(comb_mix_fused, comb_mix_ref) + torch.testing.assert_close(layer_input_fused, layer_input_ref) 
+ + +def main(): + for n1 in [512, 1024, 2048, 8192]: + for hidden_size in [1280, 2560, 4096]: + for hc_mult in [4]: + test(n=n1, hidden_size=hidden_size, hc_mult=hc_mult) + + +if __name__ == "__main__": + main() diff --git a/examples/deepseek_mhc/test_example_mhc.py b/examples/deepseek_mhc/test_example_mhc.py new file mode 100644 index 000000000..3d9ecad4d --- /dev/null +++ b/examples/deepseek_mhc/test_example_mhc.py @@ -0,0 +1,18 @@ +import tilelang.testing + +from example_mhc_post import main as main_post +from example_mhc_pre import main as main_pre + + +@tilelang.testing.requires_cuda +def test_mhc_post(): + main_post() + + +@tilelang.testing.requires_cuda +def test_mhc_pre(): + main_pre() + + +if __name__ == "__main__": + tilelang.testing.main() diff --git a/examples/deepseek_mla/README.md b/examples/deepseek_mla/README.md index e64b1c37d..bd3539d26 100644 --- a/examples/deepseek_mla/README.md +++ b/examples/deepseek_mla/README.md @@ -24,14 +24,14 @@ We benchmarked the performance of FlashMLA, TileLang, Torch, Triton, and FlashIn
Figure 2: Performance under batch size=128
-As shown in the results, TileLang achieves performance comparable to FlashMLA in most cases, significantly outperforming both FlashInfer and Triton. +As shown in the results, TileLang achieves performance comparable to FlashMLA in most cases, significantly outperforming both FlashInfer and Triton. Notably, **TileLang accomplishes this with just around 80 lines of Python code**, demonstrating its exceptional ease of use and efficiency. Let's dive in and see how TileLang achieves this. ## Implementation First, let's review the core computation logic of traditional FlashAttention: -```python +```python # acc_s: [block_M, block_N] # scores_max: [block_M] # scores_scale: [block_M] @@ -54,7 +54,7 @@ Compared to traditional attention operators like MHA (Multi-Headed Attention) or This raises the question of how to partition the matrix multiplication operation. On the Hopper architecture, most computation kernels use [`wgmma.mma_async`](https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions) instructions for optimal performance. The `wgmma.mma_async` instruction organizes 4 warps (128 threads) into a warpgroup for collective MMA operations. However, `wgmma.mma_async` instructions require a minimum M dimension of 64. This means each warpgroup's minimum M dimension can only be reduced to 64, but a tile size of 64*512 is too large for a single warpgroup, leading to register spilling. -Therefore, our only option is to partition `acc_o` along the `dim` dimension, with two warpgroups computing the left and right part of `acc_o` respectively. However, this introduces another challenge: both warpgroups require the complete `acc_s` result as input. +Therefore, our only option is to partition `acc_o` along the `dim` dimension, with two warpgroups computing the left and right part of `acc_o` respectively. However, this introduces another challenge: both warpgroups require the complete `acc_s` result as input. Our solution is to have each warpgroup compute half of `acc_s` during `Q @ K` computation, then obtain the other half computed by the other warpgroup through shared memory. @@ -96,7 +96,6 @@ T.use_swizzle(panel_size: int, order: str = "row") Here, `panel_size` specifies the width of the swizzled threadblock group, and `order` determines the swizzling pattern, which can be either "row" or "col". - ### Shared Memory Swizzling In CUDA programming, shared memory is divided into multiple memory banks, with each bank capable of servicing one thread request per clock cycle in parallel. Bank conflicts occur when multiple threads simultaneously access different addresses mapped to the same bank, forcing these accesses to be serialized and degrading performance. @@ -113,17 +112,14 @@ T.annotate_layout({ Here, `T.annotate_layout` allows users to specify any desired layout for a buffer. For convenience, TileLang provides the `make_swizzled_layout` primitive to automatically generate a swizzled layout. - ### Warp-Specialization The Hopper architecture commonly employs warp specialization for performance optimization. A typical approach is to designate one warpgroup as a producer that handles data movement using TMA (Tensor Memory Accelerator), while the remaining warpgroups serve as consumers performing computations. However, this programming pattern is complex, requiring developers to manually manage the execution logic for producers and consumers, including synchronization through the `mbarrier` objects. 
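For contrast, the TileLang kernels in this change set are written as straight-line tile code with no explicit producer or consumer logic. The fragment below is an illustrative sketch only, trimmed from the decode kernels elsewhere in this PR: the tensor names, block sizes and dtypes are placeholders, and the `@tilelang.jit`/`T.prim_func` scaffolding together with the online-softmax bookkeeping is omitted.

```python
# Sketch of the user-facing style (placeholders, not a complete kernel):
with T.Kernel(batch, heads // block_H, threads=threads) as (bx, by):
    Q_local = T.alloc_fragment([block_H, dim], dtype)
    KV_shared = T.alloc_shared([block_N, dim], dtype)
    acc_s = T.alloc_fragment([block_H, block_N], accum_dtype)

    T.copy(Q[bx, by * block_H:(by + 1) * block_H, :], Q_local)
    for k in T.Pipelined(T.ceildiv(seqlen_kv, block_N), num_stages=2):
        # A plain copy: the compiler decides whether this becomes a TMA load
        # issued by a producer warpgroup.
        T.copy(KV[bx, k * block_N:(k + 1) * block_N, :], KV_shared)
        T.clear(acc_s)
        # A plain GEMM: the compiler maps this onto the consumer warpgroups.
        T.gemm(Q_local, KV_shared, acc_s, transpose_B=True)
        # ... online softmax and the P @ V GEMM follow, as in the full kernels ...
```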
In TileLang, users are completely shielded from these implementation details. The frontend script is automatically transformed into a warp-specialized form, where TileLang handles all producer-consumer synchronization automatically, enabling efficient computation. - ### Pipeline - Pipeline is a technique used to improve memory access efficiency by overlapping memory access and computation. In TileLang, pipeline can be implemented through the `T.pipelined` annotation: ```python @@ -132,9 +128,8 @@ T.pipelined(range: int, stage: int) Here, `range` specifies the range of the pipeline, and `stage` specifies the stage of the pipeline. Multi-stage pipelining enables overlapping of computation and memory access, which can significantly improve performance for memory-intensive operators. However, setting a higher number of stages consumes more shared memory resources, so the optimal configuration needs to be determined based on specific use cases. - ### Split-KV We have also implemented Split-KV optimization similar to [FlashDecoding](https://pytorch.org/blog/flash-decoding/). Specifically, when the batch size is small, parallel SM resources cannot be fully utilized due to low parallelism. In such cases, we can split the kv_ctx dimension across multiple SMs for parallel computation and then merge the results. -In our implementation, we have developed both split and combine kernels, allowing users to control the split size through a `num_split` parameter. \ No newline at end of file +In our implementation, we have developed both split and combine kernels, allowing users to control the split size through a `num_split` parameter. diff --git a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_aiter.py b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_aiter.py new file mode 100644 index 000000000..9eae48082 --- /dev/null +++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_aiter.py @@ -0,0 +1,290 @@ +# This benchmark script is modified based on: https://github.com/deepseek-ai/FlashMLA/blob/main/benchmark/bench_flash_mla.py +# ruff: noqa +import argparse +import math +import random +import torch + +import triton +import triton.language as tl + +import tilelang +from tilelang.profiler import do_bench + +try: + from aiter.mla import mla_decode_fwd +except ImportError: + print("aiter is AMD specific kernel library. 
Please make sure aiter is installed on your AMD device.") + + +def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): + query = query.float() + key = key.float() + value = value.float() + key = key.repeat_interleave(h_q // h_kv, dim=0) + value = value.repeat_interleave(h_q // h_kv, dim=0) + attn_weight = query @ key.transpose(-2, -1) / math.sqrt(query.size(-1)) + if is_causal: + s_q = query.shape[-2] + s_k = key.shape[-2] + attn_bias = torch.zeros(s_q, s_k, dtype=query.dtype) + temp_mask = torch.ones(s_q, s_k, dtype=torch.bool).tril(diagonal=s_k - s_q) + attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) + attn_bias.to(query.dtype) + attn_weight += attn_bias + lse = attn_weight.logsumexp(dim=-1) + attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32) + return attn_weight @ value, lse + + +@torch.inference_mode() +def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): + blocked_v = blocked_k[..., :dv] + + def ref_mla(): + out = torch.empty(b, s_q, h_q, dv, dtype=torch.float32) + lse = torch.empty(b, h_q, s_q, dtype=torch.float32) + for i in range(b): + begin = i * max_seqlen_pad + end = begin + cache_seqlens[i] + O, LSE = scaled_dot_product_attention( + q[i].transpose(0, 1), + blocked_k.view(-1, h_kv, d)[begin:end].transpose(0, 1), + blocked_v.view(-1, h_kv, dv)[begin:end].transpose(0, 1), + h_q, + h_kv, + is_causal=causal, + ) + out[i] = O.transpose(0, 1) + lse[i] = LSE + return out, lse + + out_torch, lse_torch = ref_mla() + t = triton.testing.do_bench(ref_mla) + return out_torch, lse_torch, t + + +@torch.inference_mode() +def run_mla_aiter(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): + assert d > dv, "mla with rope dim should be larger than no rope dim" + + qo_indptr = torch.zeros(b + 1, dtype=torch.int) + kv_indptr = torch.zeros(b + 1, dtype=torch.int) + seq_lens_qo = torch.empty(b, dtype=torch.int) + seq_lens_qo.fill_(1) + max_seqlen_qo = seq_lens_qo.max().item() + + kv_indptr[1 : b + 1] = torch.cumsum(cache_seqlens, dim=0) + qo_indptr[1 : b + 1] = torch.cumsum(seq_lens_qo, dim=0) + total_q = qo_indptr[-1].item() + + # set block_size to 1 + page_size = 1 + kv_buffer = blocked_k.view(-1, page_size, h_kv, d) + + flat_indices = [] + for i in range(b): + start = i * max_seqlen_pad + end = start + cache_seqlens[i] + flat_indices.append(torch.arange(start, end, dtype=torch.int)) + + kv_indices = torch.cat(flat_indices) + + kv_last_page_lens = torch.ones(b, dtype=torch.int) + + sm_scale = 1.0 / (d**0.5) + + def mla_aiter(): + out_aiter = torch.empty((total_q, h_q, dv), dtype=dtype).fill_(-1) + attn_logits_aiter, attn_lse_aiter = mla_decode_fwd( + q.view((total_q, h_q, d)), + kv_buffer, + out_aiter, + qo_indptr, + kv_indptr, + kv_indices, + kv_last_page_lens, + max_seqlen_qo, + sm_scale, + ) + return out_aiter.view([b, s_q, h_q, dv]) + + out_aiter = mla_aiter() + t = triton.testing.do_bench(mla_aiter) + return out_aiter, None, t + + +FUNC_TABLE = { + "torch": run_torch_mla, + "mla_aiter": run_mla_aiter, +} + + +def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): + print( + f"comparing {baseline} vs {target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}" + ) + device = torch.device("cuda:0") + torch.set_default_dtype(dtype) + torch.set_default_device(device) + torch.cuda.set_device(device) + 
torch.manual_seed(0) + random.seed(0) + assert baseline in FUNC_TABLE + assert target in FUNC_TABLE + baseline_func = FUNC_TABLE[baseline] + target_func = FUNC_TABLE[target] + + total_seqlens = cache_seqlens.sum().item() + max_seqlen = cache_seqlens.max().item() + max_seqlen_pad = triton.cdiv(max_seqlen, 256) * 256 + # print(f"{total_seqlens=}, {mean_seqlens=}, {max_seqlen=}") + + q = torch.randn(b, s_q, h_q, d) + block_size = 64 + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) + + out_a, lse_a, perf_a = baseline_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) + + torch.testing.assert_close(out_b.float(), out_a.float(), atol=1e-2, rtol=1e-2), "out" + if target not in ["mla_aiter"]: + # flash_mla_triton doesn't return lse + torch.testing.assert_close(lse_b.float(), lse_a.float(), atol=1e-2, rtol=1e-2), "lse" + + FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10**9 / perf_a:.3f} TFLOPS, {bytes / 10**6 / perf_a:.3f} GB/s") + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.3f} TFLOPS, {bytes / 10**6 / perf_b:.3f} GB/s") + return bytes / 10**6 / perf_a, bytes / 10**6 / perf_b + + +def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): + print(f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}") + torch.set_default_dtype(dtype) + device = torch.device("cuda:0") + torch.set_default_device(device) + torch.cuda.set_device(device) + torch.manual_seed(0) + random.seed(0) + assert target in FUNC_TABLE, f"target {target} not in {FUNC_TABLE}" + target_func = FUNC_TABLE[target] + + total_seqlens = cache_seqlens.sum().item() + max_seqlen = cache_seqlens.max().item() + max_seqlen_pad = triton.cdiv(max_seqlen, 256) * 256 + # print(f"{total_seqlens=}, {mean_seqlens=}, {max_seqlen=}") + + q = torch.randn(b, s_q, h_q, d) + block_size = 64 + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) + + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) + + FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.3f} TFLOPS, {bytes / 10**6 / perf_b:.3f} GB/s") + return bytes / 10**6 / perf_b + + +available_targets = [ + "torch", + "mla_aiter", +] + +shape_configs = [ + { + "b": batch, + "s_q": 1, + "cache_seqlens": torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), + "h_q": head, + "h_kv": 1, + "d": 512 + 64, + "dv": 512, + "causal": True, + "dtype": torch.bfloat16, + } + for batch in [64, 128] + for seqlen in [1024, 2048, 4096, 8192, 16384] + for head in [128] +] + + +def get_args(): + parser = argparse.ArgumentParser() + 
parser.add_argument("--baseline", type=str, default="torch") + parser.add_argument("--target", type=str, default="mla_aiter") + parser.add_argument("--all", action="store_true") + parser.add_argument("--one", action="store_true") + parser.add_argument("--compare", action="store_true") + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = get_args() + benchmark_type = "all" if args.all else f"{args.baseline}_vs_{args.target}" if args.compare else args.target + with open(f"{benchmark_type}_perf.csv", "w") as fout: + fout.write("name,batch,seqlen,head,bw\n") + for shape in shape_configs: + if args.all: + for target in available_targets: + perf = compare_a( + target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) + fout.write( + f"{target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" + ) + elif args.compare: + perfa, prefb = compare_ab( + args.baseline, + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) + fout.write( + f"{args.baseline},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perfa:.0f}\n" + ) + fout.write( + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{prefb:.0f}\n" + ) + elif args.one: + perf = compare_a( + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) + fout.write( + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" + ) diff --git a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py index db460437f..399bb8e6e 100644 --- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py +++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py @@ -8,6 +8,7 @@ def get_configs(): import itertools + BLOCK_N = [16, 32, 64, 128] BLOCK_H = [16, 32, 64, 128] num_split = [1, 2, 4, 8, 16, 32] @@ -15,45 +16,44 @@ def get_configs(): _configs = list(itertools.product(BLOCK_N, BLOCK_H, num_split, threads)) - return [{ - "block_N": c[0], - "block_H": c[1], - "num_split": c[2], - "threads": c[3], - } for c in _configs] + return [ + { + "block_N": c[0], + "block_H": c[1], + "num_split": c[2], + "threads": c[3], + } + for c in _configs + ] @tilelang.autotune(configs=get_configs()) @tilelang.jit( - out_idx=[6], pass_configs={ + out_idx=[6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashmla_decode(batch, - heads, - kv_head_num, - seqlen_kv, - dim, - pe_dim, - block_N, - block_H, - num_split, - threads=128): - scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" + }, +) +def flashmla_decode(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, threads=128): + scale = (1.0 / (dim + pe_dim)) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" - @T.macro - def flash_attn( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: 
T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + @T.prim_func + def main_split( + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): - with T.Kernel(batch, heads // min(block_H, kv_group_num), threads=threads) as (bx, by): + # flash_attn_split + with T.Kernel(batch, heads // min(block_H, kv_group_num), num_split, threads=threads) as (bx, by, bz): Q_local = T.alloc_fragment([block_H, dim], dtype) Q_pe_local = T.alloc_fragment([block_H, pe_dim], dtype) KV_shared = T.alloc_shared([block_N, dim], dtype) @@ -69,34 +69,31 @@ def flash_attn( cur_kv_head = by // (kv_group_num // block_H) T.use_swizzle(10) - - T.copy(Q[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_local) - T.copy(Q_pe[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_pe_local) + T.copy(Q[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_local) + T.copy(Q_pe[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_pe_local) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = T.ceildiv(seqlen_kv, block_N) + loop_range = T.ceildiv((seqlen_kv // num_split), block_N) for k in T.Pipelined(loop_range, num_stages=0): - T.copy(KV[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], KV_shared) - T.copy(K_pe[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], K_pe_shared) + kv_start = (seqlen_kv // num_split) * bz + k * block_N + kv_end = (seqlen_kv // num_split) * bz + (k + 1) * block_N + T.copy(KV[bx, kv_start:kv_end, cur_kv_head, :], KV_shared) + T.copy(K_pe[bx, kv_start:kv_end, cur_kv_head, :], K_pe_shared) T.clear(acc_s) T.gemm(Q_local, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.gemm( - Q_pe_local, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_pe_local, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) - # T.copy(acc_s, S_shared) T.copy(acc_s, acc_s_cast) for i in T.Parallel(block_H): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] @@ -105,20 +102,50 @@ def flash_attn( T.gemm(acc_s_cast, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :]) + for i in T.Parallel(block_H): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + T.copy(logsum, glse[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, bz]) + T.copy(acc_o, Output_partial[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, bz, :]) - @T.macro - def flash_attn_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: 
T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + # combine + with T.Kernel(heads, batch, threads=128) as (by, bz): + po_local = T.alloc_fragment([dim], dtype) + o_accum_local = T.alloc_fragment([dim], accum_dtype) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + lse_max_local = -T.infinity(accum_dtype) + for k in T.serial(num_split): + lse_max_local = T.max(lse_max_local, glse[bz, by, k]) + for k in T.Pipelined(num_split, num_stages=1): + lse_local_split = glse[bz, by, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local + for k in T.serial(num_split): + for i in T.Parallel(dim): + po_local[i] = Output_partial[bz, by, k, i] + lse_local_split = glse[bz, by, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) + for i in T.Parallel(dim): + o_accum_local[i] += po_local[i] * scale_local + for i in T.Parallel(dim): + Output[bz, by, i] = o_accum_local[i] + + @T.prim_func + def main_no_split( + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): - with T.Kernel( - batch, heads // min(block_H, kv_group_num), num_split, - threads=threads) as (bx, by, bz): + with T.Kernel(batch, heads // min(block_H, kv_group_num), threads=threads) as (bx, by): Q_local = T.alloc_fragment([block_H, dim], dtype) Q_pe_local = T.alloc_fragment([block_H, pe_dim], dtype) KV_shared = T.alloc_shared([block_N, dim], dtype) @@ -134,34 +161,31 @@ def flash_attn_split( cur_kv_head = by // (kv_group_num // block_H) T.use_swizzle(10) - T.copy(Q[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_local) - T.copy(Q_pe[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_pe_local) + + T.copy(Q[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_local) + T.copy(Q_pe[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_pe_local) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = T.ceildiv((seqlen_kv // num_split), block_N) + loop_range = T.ceildiv(seqlen_kv, block_N) for k in T.Pipelined(loop_range, num_stages=0): - kv_start = (seqlen_kv // num_split) * bz + k * block_N - kv_end = (seqlen_kv // num_split) * bz + (k + 1) * block_N - T.copy(KV[bx, kv_start:kv_end, cur_kv_head, :], KV_shared) - T.copy(K_pe[bx, kv_start:kv_end, cur_kv_head, :], K_pe_shared) + T.copy(KV[bx, k * block_N : (k + 1) * block_N, cur_kv_head, :], KV_shared) + T.copy(K_pe[bx, k * block_N : (k + 1) * block_N, cur_kv_head, :], K_pe_shared) T.clear(acc_s) T.gemm(Q_local, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.gemm( - Q_pe_local, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_pe_local, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, 
scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) + # T.copy(acc_s, S_shared) T.copy(acc_s, acc_s_cast) for i in T.Parallel(block_H): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] @@ -170,72 +194,7 @@ def flash_attn_split( T.gemm(acc_s_cast, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] - for i in T.Parallel(block_H): - logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, glse[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, bz]) - T.copy(acc_o, Output_partial[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, bz, :]) - - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - with T.Kernel(heads, batch, threads=128) as (by, bz): - po_local = T.alloc_fragment([dim], dtype) - o_accum_local = T.alloc_fragment([dim], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - - T.annotate_layout({ - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) - - T.clear(lse_logsum_local) - T.clear(o_accum_local) - lse_max_local[0] = -T.infinity(accum_dtype) - for k in T.serial(num_split): - lse_max_local[0] = T.max(lse_max_local[0], glse[bz, by, k]) - for k in T.Pipelined(num_split, num_stages=1): - lse_local_split[0] = glse[bz, by, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] - for k in T.serial(num_split): - for i in T.Parallel(dim): - po_local[i] = Output_partial[bz, by, k, i] - lse_local_split[0] = glse[bz, by, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) - for i in T.Parallel(dim): - o_accum_local[i] += po_local[i] * scale_local[0] - for i in T.Parallel(dim): - Output[bz, by, i] = o_accum_local[i] - - @T.prim_func - def main_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - flash_attn_split(Q, Q_pe, KV, K_pe, glse, Output_partial) - combine(glse, Output_partial, Output) - - @T.prim_func - def main_no_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - flash_attn(Q, Q_pe, KV, K_pe, Output) + T.copy(acc_o, Output[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :]) if num_split > 1: return main_split @@ -258,43 +217,36 @@ def ref_program(q, 
q_pe, kv, k_pe, glse, Output_partial): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=128, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') - parser.add_argument('--autotune', action='store_true', help='auto tune') + parser.add_argument("--batch", type=int, default=128, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") + parser.add_argument("--autotune", action="store_true", help="auto tune") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim enable_autotune = args.autotune @@ -307,26 +259,16 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): num_split = 4 threads = 128 + print(f"Using {batch=}, {heads=}, {kv_heads=}, {kv_ctx=}, {dim=}, {pe_dim=}") + if enable_autotune: kernel = flashmla_decode(batch, heads, kv_heads, kv_ctx, dim, pe_dim) else: - kernel = flashmla_decode( - batch, - heads, - kv_heads, - kv_ctx, 
- dim, - pe_dim, - BLOCK_N, - BLOCK_H, - num_split, - threads=threads) + kernel = flashmla_decode(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, threads=threads) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) input_tensors = profiler._get_inputs() tilelang_output = kernel(*input_tensors) ref_output = ref_program(*input_tensors) - print(f"Tilelang output: {tilelang_output}") - print(f"Ref output: {ref_output}") torch.testing.assert_close(tilelang_output, ref_output, rtol=0.01, atol=0.01) latency = profiler.do_bench(warmup=500) print(f"Latency: {latency} ms") diff --git a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_torch.py b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_torch.py deleted file mode 100644 index 0006d9468..000000000 --- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_torch.py +++ /dev/null @@ -1,495 +0,0 @@ -# This benchmark script is modified based on: https://github.com/deepseek-ai/FlashMLA/blob/main/benchmark/bench_flash_mla.py -# ruff: noqa -import argparse -import math -import random -import torch -import triton -import triton.language as tl - -import tilelang -from tilelang.profiler import do_bench - - -def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): - query = query.float() - key = key.float() - value = value.float() - key = key.repeat_interleave(h_q // h_kv, dim=0) - value = value.repeat_interleave(h_q // h_kv, dim=0) - attn_weight = query @ key.transpose(-2, -1) / math.sqrt(query.size(-1)) - if is_causal: - s_q = query.shape[-2] - s_k = key.shape[-2] - attn_bias = torch.zeros(s_q, s_k, dtype=query.dtype) - temp_mask = torch.ones(s_q, s_k, dtype=torch.bool).tril(diagonal=s_k - s_q) - attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) - attn_bias.to(query.dtype) - attn_weight += attn_bias - lse = attn_weight.logsumexp(dim=-1) - attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32) - return attn_weight @ value, lse - - -@torch.inference_mode() -def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, - h_kv, d, dv, causal, dtype): - blocked_v = blocked_k[..., :dv] - - def ref_mla(): - out = torch.empty(b, s_q, h_q, dv, dtype=torch.float32) - lse = torch.empty(b, h_q, s_q, dtype=torch.float32) - for i in range(b): - begin = i * max_seqlen_pad - end = begin + cache_seqlens[i] - O, LSE = scaled_dot_product_attention( - q[i].transpose(0, 1), - blocked_k.view(-1, h_kv, d)[begin:end].transpose(0, 1), - blocked_v.view(-1, h_kv, dv)[begin:end].transpose(0, 1), - h_q, - h_kv, - is_causal=causal, - ) - out[i] = O.transpose(0, 1) - lse[i] = LSE - return out, lse - - out_torch, lse_torch = ref_mla() - t = triton.testing.do_bench(ref_mla) - return out_torch, lse_torch, t - - -@triton.jit -def _mla_attn_kernel( - Q_nope, - Q_pe, - Kv_c_cache, - K_pe_cache, - Req_to_tokens, - B_seq_len, - O, - sm_scale, - stride_q_nope_bs, - stride_q_nope_h, - stride_q_pe_bs, - stride_q_pe_h, - stride_kv_c_bs, - stride_k_pe_bs, - stride_req_to_tokens_bs, - stride_o_b, - stride_o_h, - stride_o_s, - BLOCK_H: tl.constexpr, - BLOCK_N: tl.constexpr, - NUM_KV_SPLITS: tl.constexpr, - PAGE_SIZE: tl.constexpr, - HEAD_DIM_CKV: tl.constexpr, - HEAD_DIM_KPE: tl.constexpr, -): - cur_batch = tl.program_id(1) - cur_head_id = tl.program_id(0) - split_kv_id = tl.program_id(2) - - cur_batch_seq_len = tl.load(B_seq_len + cur_batch) - - offs_d_ckv = tl.arange(0, HEAD_DIM_CKV) - cur_head = cur_head_id * BLOCK_H + tl.arange(0, BLOCK_H) - 
offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[ - None, :] - q_nope = tl.load(Q_nope + offs_q_nope) - - offs_d_kpe = tl.arange(0, HEAD_DIM_KPE) - offs_q_pe = cur_batch * stride_q_pe_bs + cur_head[:, None] * stride_q_pe_h + offs_d_kpe[None, :] - q_pe = tl.load(Q_pe + offs_q_pe) - - e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf") - e_sum = tl.zeros([BLOCK_H], dtype=tl.float32) - acc = tl.zeros([BLOCK_H, HEAD_DIM_CKV], dtype=tl.float32) - - kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS) - split_kv_start = kv_len_per_split * split_kv_id - split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len) - - for start_n in range(split_kv_start, split_kv_end, BLOCK_N): - offs_n = start_n + tl.arange(0, BLOCK_N) - kv_page_number = tl.load( - Req_to_tokens + stride_req_to_tokens_bs * cur_batch + offs_n // PAGE_SIZE, - mask=offs_n < split_kv_end, - other=0, - ) - kv_loc = kv_page_number * PAGE_SIZE + offs_n % PAGE_SIZE - offs_k_c = kv_loc[None, :] * stride_kv_c_bs + offs_d_ckv[:, None] - k_c = tl.load(Kv_c_cache + offs_k_c, mask=offs_n[None, :] < split_kv_end, other=0.0) - - qk = tl.dot(q_nope, k_c.to(q_nope.dtype)) - - offs_k_pe = kv_loc[None, :] * stride_k_pe_bs + offs_d_kpe[:, None] - k_pe = tl.load(K_pe_cache + offs_k_pe, mask=offs_n[None, :] < split_kv_end, other=0.0) - - qk += tl.dot(q_pe, k_pe.to(q_pe.dtype)) - qk *= sm_scale - - qk = tl.where(offs_n[None, :] < split_kv_end, qk, float("-inf")) - - v_c = tl.trans(k_c) - - n_e_max = tl.maximum(tl.max(qk, 1), e_max) - re_scale = tl.exp(e_max - n_e_max) - p = tl.exp(qk - n_e_max[:, None]) - acc *= re_scale[:, None] - acc += tl.dot(p.to(v_c.dtype), v_c) - - e_sum = e_sum * re_scale + tl.sum(p, 1) - e_max = n_e_max - offs_o = cur_batch * stride_o_b + cur_head[:, - None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[ - None, :] - tl.store(O + offs_o, acc / e_sum[:, None]) - offs_o_1 = cur_batch * stride_o_b + cur_head * stride_o_h + split_kv_id * stride_o_s + HEAD_DIM_CKV - tl.store(O + offs_o_1, e_max + tl.log(e_sum)) - - -def _mla_attn( - q_nope, - q_pe, - kv_c_cache, - k_pe_cache, - attn_logits, - req_to_tokens, - b_seq_len, - num_kv_splits, - sm_scale, - page_size, -): - batch_size, head_num = q_nope.shape[0], q_nope.shape[1] - head_dim_ckv = q_nope.shape[-1] - head_dim_kpe = q_pe.shape[-1] - - BLOCK_H = 16 - BLOCK_N = 64 - grid = ( - triton.cdiv(head_num, BLOCK_H), - batch_size, - num_kv_splits, - ) - _mla_attn_kernel[grid]( - q_nope, - q_pe, - kv_c_cache, - k_pe_cache, - req_to_tokens, - b_seq_len, - attn_logits, - sm_scale, - # stride - q_nope.stride(0), - q_nope.stride(1), - q_pe.stride(0), - q_pe.stride(1), - kv_c_cache.stride(-2), - k_pe_cache.stride(-2), - req_to_tokens.stride(0), - attn_logits.stride(0), - attn_logits.stride(1), - attn_logits.stride(2), - BLOCK_H=BLOCK_H, - BLOCK_N=BLOCK_N, - NUM_KV_SPLITS=num_kv_splits, - PAGE_SIZE=page_size, - HEAD_DIM_CKV=head_dim_ckv, - HEAD_DIM_KPE=head_dim_kpe, - num_stages=1, # 2 will oom in amd - ) - - -@triton.jit -def _mla_softmax_reducev_kernel( - Logits, - B_seq_len, - O, - stride_l_b, - stride_l_h, - stride_l_s, - stride_o_b, - stride_o_h, - NUM_KV_SPLITS: tl.constexpr, - HEAD_DIM_CKV: tl.constexpr, -): - cur_batch = tl.program_id(0) - cur_head = tl.program_id(1) - cur_batch_seq_len = tl.load(B_seq_len + cur_batch) - - offs_d_ckv = tl.arange(0, HEAD_DIM_CKV) - - e_sum = 0.0 - e_max = -float("inf") - acc = tl.zeros([HEAD_DIM_CKV], dtype=tl.float32) - - offs_l = cur_batch * stride_l_b + cur_head * 
stride_l_h + offs_d_ckv - offs_l_1 = cur_batch * stride_l_b + cur_head * stride_l_h + HEAD_DIM_CKV - - for split_kv_id in range(0, NUM_KV_SPLITS): - kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS) - split_kv_start = kv_len_per_split * split_kv_id - split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len) - - if split_kv_end > split_kv_start: - logits = tl.load(Logits + offs_l + split_kv_id * stride_l_s) - logits_1 = tl.load(Logits + offs_l_1 + split_kv_id * stride_l_s) - - n_e_max = tl.maximum(logits_1, e_max) - old_scale = tl.exp(e_max - n_e_max) - acc *= old_scale - exp_logic = tl.exp(logits_1 - n_e_max) - acc += exp_logic * logits - - e_sum = e_sum * old_scale + exp_logic - e_max = n_e_max - - tl.store( - O + cur_batch * stride_o_b + cur_head * stride_o_h + offs_d_ckv, - acc / e_sum, - ) - - -def _mla_softmax_reducev( - logits, - o, - b_seq_len, - num_kv_splits, -): - batch_size, head_num, head_dim_ckv = o.shape[0], o.shape[1], o.shape[2] - grid = (batch_size, head_num) - _mla_softmax_reducev_kernel[grid]( - logits, - b_seq_len, - o, - logits.stride(0), - logits.stride(1), - logits.stride(2), - o.stride(0), - o.stride(1), - NUM_KV_SPLITS=num_kv_splits, - HEAD_DIM_CKV=head_dim_ckv, - ) - - -def mla_decode_triton( - q_nope, - q_pe, - kv_c_cache, - k_pe_cache, - o, - req_to_tokens, - b_seq_len, - attn_logits, - num_kv_splits, - sm_scale, - page_size, -): - assert num_kv_splits == attn_logits.shape[2] - _mla_attn( - q_nope, - q_pe, - kv_c_cache, - k_pe_cache, - attn_logits, - req_to_tokens, - b_seq_len, - num_kv_splits, - sm_scale, - page_size, - ) - _mla_softmax_reducev( - attn_logits, - o, - b_seq_len, - num_kv_splits, - ) - - -@torch.inference_mode() -def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, - cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - - blocked_v = blocked_k[..., :dv] - - assert d > dv, "mla with rope dim should be larger than no rope dim" - q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() - - def flash_mla_triton(): - num_kv_splits = 32 - o = torch.empty([b * s_q, h_q, dv]) - attn_logits = torch.empty([b * s_q, h_q, num_kv_splits, dv + 1]) - mla_decode_triton( - q_nope.view(-1, h_q, dv), q_pe.view(-1, h_q, d - dv), blocked_k_nope.view(-1, dv), - blocked_k_pe.view(-1, d - dv), o, block_table, cache_seqlens, attn_logits, - num_kv_splits, 1 / math.sqrt(d), block_size) - return o.view([b, s_q, h_q, dv]) - - out_flash = flash_mla_triton() - t = triton.testing.do_bench(flash_mla_triton) - return out_flash, None, t - - -FUNC_TABLE = { - "torch": run_torch_mla, - "flash_mla_triton": run_flash_mla_triton, -} - - -def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - print( - f"comparing {baseline} vs {target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}" - ) - device = torch.device("cuda:0") - torch.set_default_dtype(dtype) - torch.set_default_device(device) - torch.cuda.set_device(device) - torch.manual_seed(0) - random.seed(0) - assert baseline in FUNC_TABLE - assert target in FUNC_TABLE - baseline_func = FUNC_TABLE[baseline] - target_func = FUNC_TABLE[target] - - total_seqlens = cache_seqlens.sum().item() - max_seqlen = cache_seqlens.max().item() - max_seqlen_pad = triton.cdiv(max_seqlen, 256) * 256 - # print(f"{total_seqlens=}, {mean_seqlens=}, {max_seqlen=}") - - q = torch.randn(b, 
s_q, h_q, d) - block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) - blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - - out_a, lse_a, perf_a = baseline_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) - - torch.testing.assert_close(out_b.float(), out_a.float(), atol=1e-2, rtol=1e-2), "out" - if target not in ["flash_mla_triton"]: - # flash_mla_triton doesn't return lse - torch.testing.assert_close(lse_b.float(), lse_a.float(), atol=1e-2, rtol=1e-2), "lse" - - FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10 ** 9 / perf_a:.0f} TFLOPS, {bytes / 10 ** 6 / perf_a:.0f} GB/s" - ) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) - return bytes / 10**6 / perf_a, bytes / 10**6 / perf_b - - -def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - print( - f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}" - ) - torch.set_default_dtype(dtype) - device = torch.device("cuda:0") - torch.set_default_device(device) - torch.cuda.set_device(device) - torch.manual_seed(0) - random.seed(0) - assert target in FUNC_TABLE, f"target {target} not in {FUNC_TABLE}" - target_func = FUNC_TABLE[target] - - total_seqlens = cache_seqlens.sum().item() - max_seqlen = cache_seqlens.max().item() - max_seqlen_pad = triton.cdiv(max_seqlen, 256) * 256 - # print(f"{total_seqlens=}, {mean_seqlens=}, {max_seqlen=}") - - q = torch.randn(b, s_q, h_q, d) - block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) - blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) - - FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) - return bytes / 10**6 / perf_b - - -available_targets = [ - "torch", - "flash_mla_triton", -] - -shape_configs = [{ - "b": - batch, - "s_q": - 1, - "cache_seqlens": - torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), - "h_q": - head, - "h_kv": - 1, - "d": - 512 + 64, - "dv": - 512, - "causal": - True, - "dtype": - torch.float16 -} for batch in [128] for seqlen in [1024, 2048, 4096, 8192, 16384] for head in [128]] - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--baseline", type=str, default="torch") - parser.add_argument("--target", type=str, default="torch") - parser.add_argument("--all", action="store_true") - parser.add_argument("--one", action="store_true") - parser.add_argument("--compare", action="store_true") - args = parser.parse_args() - return args - - -if __name__ == "__main__": - args = get_args() - 
benchmark_type = "all" if args.all else f"{args.baseline}_vs_{args.target}" if args.compare else args.target - with open(f"{benchmark_type}_perf.csv", "w") as fout: - fout.write("name,batch,seqlen,head,bw\n") - for shape in shape_configs: - if args.all: - for target in available_targets: - perf = compare_a(target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) - fout.write( - f'{target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' - ) - elif args.compare: - perfa, prefb = compare_ab(args.baseline, args.target, shape["b"], shape["s_q"], - shape["cache_seqlens"], shape["h_q"], shape["h_kv"], - shape["d"], shape["dv"], shape["causal"], shape["dtype"]) - fout.write( - f'{args.baseline},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perfa:.0f}\n' - ) - fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{prefb:.0f}\n' - ) - elif args.one: - perf = compare_a(args.target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) - fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' - ) diff --git a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py index 644f97da1..e8c1006a0 100644 --- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py +++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py @@ -29,8 +29,7 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): @torch.inference_mode() -def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, - h_kv, d, dv, causal, dtype): +def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] def ref_mla(): @@ -91,8 +90,7 @@ def _mla_attn_kernel( offs_d_ckv = tl.arange(0, HEAD_DIM_CKV) cur_head = cur_head_id * BLOCK_H + tl.arange(0, BLOCK_H) - offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[ - None, :] + offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[None, :] q_nope = tl.load(Q_nope + offs_q_nope) offs_d_kpe = tl.arange(0, HEAD_DIM_KPE) @@ -138,9 +136,7 @@ def _mla_attn_kernel( e_sum = e_sum * re_scale + tl.sum(p, 1) e_max = n_e_max - offs_o = cur_batch * stride_o_b + cur_head[:, - None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[ - None, :] + offs_o = cur_batch * stride_o_b + cur_head[:, None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[None, :] tl.store(O + offs_o, acc / e_sum[:, None]) offs_o_1 = cur_batch * stride_o_b + cur_head * stride_o_h + split_kv_id * stride_o_s + HEAD_DIM_CKV tl.store(O + offs_o_1, e_max + tl.log(e_sum)) @@ -306,24 +302,30 @@ def mla_decode_triton( @torch.inference_mode() -def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, - cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - +def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, 
q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() def flash_mla_triton(): num_kv_splits = 32 o = torch.empty([b * s_q, h_q, dv]) attn_logits = torch.empty([b * s_q, h_q, num_kv_splits, dv + 1]) mla_decode_triton( - q_nope.view(-1, h_q, dv), q_pe.view(-1, h_q, d - dv), blocked_k_nope.view(-1, dv), - blocked_k_pe.view(-1, d - dv), o, block_table, cache_seqlens, attn_logits, - num_kv_splits, 1 / math.sqrt(d), block_size) + q_nope.view(-1, h_q, dv), + q_pe.view(-1, h_q, d - dv), + blocked_k_nope.view(-1, dv), + blocked_k_pe.view(-1, d - dv), + o, + block_table, + cache_seqlens, + attn_logits, + num_kv_splits, + 1 / math.sqrt(d), + block_size, + ) return o.view([b, s_q, h_q, dv]) out_flash = flash_mla_triton() @@ -359,14 +361,15 @@ def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - out_a, lse_a, perf_a = baseline_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_a, lse_a, perf_a = baseline_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) torch.testing.assert_close(out_b.float(), out_a.float(), atol=1e-2, rtol=1e-2), "out" if target not in ["flash_mla_triton"]: @@ -374,21 +377,14 @@ def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal torch.testing.assert_close(lse_b.float(), lse_a.float(), atol=1e-2, rtol=1e-2), "lse" FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10 ** 9 / perf_a:.0f} TFLOPS, {bytes / 10 ** 6 / perf_a:.0f} GB/s" - ) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10**9 / perf_a:.3f} TFLOPS, {bytes / 10**6 / perf_a:.3f} GB/s") + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.3f} TFLOPS, {bytes / 10**6 / perf_b:.3f} GB/s") return bytes / 10**6 / perf_a, bytes / 10**6 / perf_b def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - print( - f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}" - ) + print(f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}") torch.set_default_dtype(dtype) device = torch.device("cuda:0") 
torch.set_default_device(device) @@ -405,19 +401,16 @@ def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.3f} TFLOPS, {bytes / 10**6 / perf_b:.3f} GB/s") return bytes / 10**6 / perf_b @@ -426,26 +419,22 @@ def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): "flash_mla_triton", ] -shape_configs = [{ - "b": - batch, - "s_q": - 1, - "cache_seqlens": - torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), - "h_q": - head, - "h_kv": - 1, - "d": - 512 + 64, - "dv": - 512, - "causal": - True, - "dtype": - torch.float16 -} for batch in [64, 128] for seqlen in [1024, 2048, 4096, 8192, 16384] for head in [128]] +shape_configs = [ + { + "b": batch, + "s_q": 1, + "cache_seqlens": torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), + "h_q": head, + "h_kv": 1, + "d": 512 + 64, + "dv": 512, + "causal": True, + "dtype": torch.float16, + } + for batch in [64, 128] + for seqlen in [1024, 2048, 4096, 8192, 16384] + for head in [128] +] def get_args(): @@ -467,26 +456,54 @@ def get_args(): for shape in shape_configs: if args.all: for target in available_targets: - perf = compare_a(target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) elif args.compare: - perfa, prefb = compare_ab(args.baseline, args.target, shape["b"], shape["s_q"], - shape["cache_seqlens"], shape["h_q"], shape["h_kv"], - shape["d"], shape["dv"], shape["causal"], shape["dtype"]) + perfa, prefb = compare_ab( + args.baseline, + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.baseline},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perfa:.0f}\n' + 
f"{args.baseline},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perfa:.0f}\n" ) fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{prefb:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{prefb:.0f}\n" ) elif args.one: - perf = compare_a(args.target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) diff --git a/examples/deepseek_mla/benchmark_mla.py b/examples/deepseek_mla/benchmark_mla.py index a542ff611..544b5e128 100644 --- a/examples/deepseek_mla/benchmark_mla.py +++ b/examples/deepseek_mla/benchmark_mla.py @@ -33,8 +33,7 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): @torch.inference_mode() -def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, - h_kv, d, dv, causal, dtype): +def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] def ref_mla(): @@ -61,8 +60,7 @@ def ref_mla(): @torch.inference_mode() -def run_flash_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, - h_kv, d, dv, causal, dtype): +def run_flash_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): from flash_mla import flash_mla_with_kvcache, get_mla_metadata blocked_v = blocked_k[..., :dv] @@ -87,14 +85,13 @@ def flash_mla(): @torch.inference_mode() -def run_flashinfer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, - h_q, h_kv, d, dv, causal, dtype): +def run_flashinfer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): # pip install flashinfer-python import flashinfer + assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() kv_indptr = [0] kv_indices = [] @@ -111,8 +108,7 @@ def run_flashinfer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) kv_indices = torch.tensor(kv_indices, dtype=torch.int32) - mla_wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper( - torch.empty(128 * 1024 * 1024, dtype=torch.int8), backend="fa3") + mla_wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper(torch.empty(128 * 1024 * 1024, dtype=torch.int8), backend="fa3") mla_wrapper.plan( q_indptr, kv_indptr, @@ -129,12 +125,7 @@ def run_flashinfer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q ) def flashinfer(): - output, lse = mla_wrapper.run( - q_nope.view(-1, h_q, dv), - q_pe.view(-1, h_q, d - dv), - blocked_k_nope, - 
blocked_k_pe, - return_lse=True) + output, lse = mla_wrapper.run(q_nope.view(-1, h_q, dv), q_pe.view(-1, h_q, d - dv), blocked_k_nope, blocked_k_pe, return_lse=True) return output.view(b, -1, h_q, dv), lse.view(b, h_q, 1) out_flash, lse_flash = flashinfer() @@ -177,8 +168,7 @@ def _mla_attn_kernel( offs_d_ckv = tl.arange(0, HEAD_DIM_CKV) cur_head = cur_head_id * BLOCK_H + tl.arange(0, BLOCK_H) - offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[ - None, :] + offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[None, :] q_nope = tl.load(Q_nope + offs_q_nope) offs_d_kpe = tl.arange(0, HEAD_DIM_KPE) @@ -224,9 +214,7 @@ def _mla_attn_kernel( e_sum = e_sum * re_scale + tl.sum(p, 1) e_max = n_e_max - offs_o = cur_batch * stride_o_b + cur_head[:, - None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[ - None, :] + offs_o = cur_batch * stride_o_b + cur_head[:, None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[None, :] tl.store(O + offs_o, acc / e_sum[:, None]) offs_o_1 = cur_batch * stride_o_b + cur_head * stride_o_h + split_kv_id * stride_o_s + HEAD_DIM_CKV tl.store(O + offs_o_1, e_max + tl.log(e_sum)) @@ -393,24 +381,30 @@ def mla_decode_triton( @torch.inference_mode() -def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, - cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - +def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() def flash_mla_triton(): num_kv_splits = 32 o = torch.empty([b * s_q, h_q, dv]) attn_logits = torch.empty([b * s_q, h_q, num_kv_splits, dv + 1]) mla_decode_triton( - q_nope.view(-1, h_q, dv), q_pe.view(-1, h_q, d - dv), blocked_k_nope.view(-1, dv), - blocked_k_pe.view(-1, d - dv), o, block_table, cache_seqlens, attn_logits, - num_kv_splits, 1 / math.sqrt(d), block_size) + q_nope.view(-1, h_q, dv), + q_pe.view(-1, h_q, d - dv), + blocked_k_nope.view(-1, dv), + blocked_k_pe.view(-1, d - dv), + o, + block_table, + cache_seqlens, + attn_logits, + num_kv_splits, + 1 / math.sqrt(d), + block_size, + ) return o.view([b, s_q, h_q, dv]) out_flash = flash_mla_triton() @@ -419,13 +413,10 @@ def flash_mla_triton(): @torch.inference_mode() -def run_flash_mla_tilelang(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, - cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - +def run_flash_mla_tilelang(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() dpe = d - dv num_kv_splits = 1 @@ -434,8 +425,7 @@ def run_flash_mla_tilelang(q, block_table, blocked_k, max_seqlen_pad, block_size out_partial = torch.empty(b, h_q, num_kv_splits, dv, dtype=dtype, device=q.device) glse = torch.empty(b, h_q, num_kv_splits, 
dtype=dtype, device=q.device) - kernel = mla_decode_tilelang(b, h_q, h_kv, max_seqlen_pad, dv, dpe, BLOCK_N, BLOCK_H, - num_kv_splits, block_size) + kernel = mla_decode_tilelang(b, h_q, h_kv, max_seqlen_pad, dv, dpe, BLOCK_N, BLOCK_H, num_kv_splits, block_size) def flash_mla_tilelang(): out = kernel( @@ -486,38 +476,31 @@ def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - out_a, lse_a, perf_a = baseline_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_a, lse_a, perf_a = baseline_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) torch.testing.assert_close(out_b.float(), out_a.float(), atol=1e-2, rtol=1e-2), "out" - if target not in ["flashinfer", "flash_mla_triton", "tilelang" - ] and baseline not in ["flashinfer", "flash_mla_triton", "tilelang"]: + if target not in ["flashinfer", "flash_mla_triton", "tilelang"] and baseline not in ["flashinfer", "flash_mla_triton", "tilelang"]: # flashinfer has a different lse return value # flash_mla_triton and flash_mla_tilelang doesn't return lse torch.testing.assert_close(lse_b.float(), lse_a.float(), atol=1e-2, rtol=1e-2), "lse" FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10 ** 9 / perf_a:.0f} TFLOPS, {bytes / 10 ** 6 / perf_a:.0f} GB/s" - ) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10**9 / perf_a:.0f} TFLOPS, {bytes / 10**6 / perf_a:.0f} GB/s") + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") return bytes / 10**6 / perf_a, bytes / 10**6 / perf_b def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - print( - f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}" - ) + print(f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}") torch.set_default_dtype(dtype) device = torch.device("cuda:0") torch.set_default_device(device) @@ -534,19 +517,16 @@ def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, 
max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") return bytes / 10**6 / perf_b @@ -558,26 +538,22 @@ def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): "flash_mla_triton", ] -shape_configs = [{ - "b": - batch, - "s_q": - 1, - "cache_seqlens": - torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), - "h_q": - head, - "h_kv": - 1, - "d": - 512 + 64, - "dv": - 512, - "causal": - True, - "dtype": - torch.float16 -} for batch in [128] for seqlen in [1024, 2048, 4096, 8192, 16384, 32768] for head in [128]] +shape_configs = [ + { + "b": batch, + "s_q": 1, + "cache_seqlens": torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), + "h_q": head, + "h_kv": 1, + "d": 512 + 64, + "dv": 512, + "causal": True, + "dtype": torch.float16, + } + for batch in [128] + for seqlen in [1024, 2048, 4096, 8192, 16384, 32768] + for head in [128] +] def get_args(): @@ -599,26 +575,54 @@ def get_args(): for shape in shape_configs: if args.all: for target in available_targets: - perf = compare_a(target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) elif args.compare: - perfa, prefb = compare_ab(args.baseline, args.target, shape["b"], shape["s_q"], - shape["cache_seqlens"], shape["h_q"], shape["h_kv"], - shape["d"], shape["dv"], shape["causal"], shape["dtype"]) + perfa, prefb = compare_ab( + args.baseline, + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.baseline},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perfa:.0f}\n' + f"{args.baseline},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perfa:.0f}\n" ) fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{prefb:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{prefb:.0f}\n" ) elif args.one: - perf = compare_a(args.target, shape["b"], shape["s_q"], shape["cache_seqlens"], 
- shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) diff --git a/examples/deepseek_mla/example_mla_decode.py b/examples/deepseek_mla/example_mla_decode.py index 417e319fd..d6d76e54e 100644 --- a/examples/deepseek_mla/example_mla_decode.py +++ b/examples/deepseek_mla/example_mla_decode.py @@ -8,27 +8,31 @@ @tilelang.jit( - out_idx=[6], pass_configs={ + out_idx=[6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, - softmax_scale): + }, +) +def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, softmax_scale): scale = float(softmax_scale * 1.44269504) # log2(e) - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" - @T.macro - def flash_attn( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + @T.prim_func + def main_split( + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): - with T.Kernel(heads // min(block_H, kv_group_num), batch, threads=256) as (hid, bid): + # flash_attn_split + with T.Kernel(batch, heads // min(block_H, kv_group_num), num_split, threads=256) as (bid, hid, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) S_shared = T.alloc_shared([block_H, block_N], dtype) Q_pe_shared = T.alloc_shared([block_H, pe_dim], dtype) @@ -36,6 +40,7 @@ def flash_attn( K_pe_shared = T.alloc_shared([block_N, pe_dim], dtype) O_shared = T.alloc_shared([block_H, dim], dtype) acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) acc_o = T.alloc_fragment([block_H, dim], accum_dtype) scores_max = T.alloc_fragment([block_H], accum_dtype) scores_max_prev = T.alloc_fragment([block_H], accum_dtype) @@ -44,64 +49,87 @@ def flash_attn( logsum = T.alloc_fragment([block_H], accum_dtype) cur_kv_head = hid // (kv_group_num // block_H) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) + T.use_swizzle(10) - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, 
-T.infinity(accum_dtype)) - loop_range = T.ceildiv(seqlen_kv, block_N) + loop_range = T.ceildiv((seqlen_kv // num_split), block_N) for k in T.Pipelined(loop_range, num_stages=2): - T.copy(KV[bid, k * block_N:(k + 1) * block_N, cur_kv_head, :], KV_shared) - T.copy(K_pe[bid, k * block_N:(k + 1) * block_N, cur_kv_head, :], K_pe_shared) - T.gemm( - Q_shared, - KV_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol, - clear_accum=True) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + kv_start = (seqlen_kv // num_split) * bz + k * block_N + kv_end = (seqlen_kv // num_split) * bz + (k + 1) * block_N + T.copy(KV[bid, kv_start:kv_end, cur_kv_head, :], KV_shared) + T.copy(K_pe[bid, kv_start:kv_end, cur_kv_head, :], K_pe_shared) + T.clear(acc_s) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) T.copy(acc_s, S_shared) + T.copy(S_shared, acc_s_cast) for i in T.Parallel(block_H): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_H, dim): acc_o[i, j] *= scores_scale[i] - T.gemm(S_shared, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) + T.gemm(acc_s_cast, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] + for i in T.Parallel(block_H): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + T.copy(logsum, glse[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, bz]) T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :]) + T.copy(O_shared, Output_partial[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, bz, :]) + + # combine + with T.Kernel(heads, batch, threads=128) as (hid, bz): + po_local = T.alloc_fragment([dim], dtype) + o_accum_local = T.alloc_fragment([dim], accum_dtype) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + lse_max_local = -T.infinity(accum_dtype) + for k in T.serial(num_split): + lse_max_local = T.max(lse_max_local, glse[bz, hid, k]) + for k in T.Pipelined(num_split, num_stages=1): + lse_local_split = glse[bz, hid, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local + for k in T.serial(num_split): + for i in T.Parallel(dim): + po_local[i] = Output_partial[bz, hid, k, i] + lse_local_split = glse[bz, hid, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) + for i in T.Parallel(dim): + o_accum_local[i] += po_local[i] * scale_local + for i in T.Parallel(dim): + Output[bz, hid, i] = o_accum_local[i] - @T.macro - def flash_attn_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: 
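# Reference semantics of the split-KV "combine" pass above, written as a PyTorch sketch (base-2
# exp/log to match T.exp2/T.log2; glse holds per-split log-sum-exp values of shape
# [batch, heads, num_split], Output_partial holds the per-split normalized outputs of shape
# [batch, heads, num_split, dim]; the function name is ours, not part of this patch):
import torch

def combine_splits(glse, out_partial):
    glse, out_partial = glse.float(), out_partial.float()
    lse_max = glse.max(dim=-1, keepdim=True).values
    lse_logsum = torch.log2(torch.exp2(glse - lse_max).sum(dim=-1, keepdim=True)) + lse_max
    weights = torch.exp2(glse - lse_logsum)                  # [batch, heads, num_split]
    return (weights.unsqueeze(-1) * out_partial).sum(dim=2)  # [batch, heads, dim]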
T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + @T.prim_func + def main_no_split( + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): - with T.Kernel( - batch, heads // min(block_H, kv_group_num), num_split, - threads=256) as (bid, hid, bz): + with T.Kernel(heads // min(block_H, kv_group_num), batch, threads=256) as (hid, bid): Q_shared = T.alloc_shared([block_H, dim], dtype) S_shared = T.alloc_shared([block_H, block_N], dtype) Q_pe_shared = T.alloc_shared([block_H, pe_dim], dtype) @@ -109,7 +137,6 @@ def flash_attn_split( K_pe_shared = T.alloc_shared([block_N, pe_dim], dtype) O_shared = T.alloc_shared([block_H, dim], dtype) acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) - acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) acc_o = T.alloc_fragment([block_H, dim], accum_dtype) scores_max = T.alloc_fragment([block_H], accum_dtype) scores_max_prev = T.alloc_fragment([block_H], accum_dtype) @@ -118,118 +145,39 @@ def flash_attn_split( logsum = T.alloc_fragment([block_H], accum_dtype) cur_kv_head = hid // (kv_group_num // block_H) - T.use_swizzle(10) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - S_shared: tilelang.layout.make_swizzled_layout(S_shared), - }) - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = T.ceildiv((seqlen_kv // num_split), block_N) + loop_range = T.ceildiv(seqlen_kv, block_N) for k in T.Pipelined(loop_range, num_stages=2): - kv_start = (seqlen_kv // num_split) * bz + k * block_N - kv_end = (seqlen_kv // num_split) * bz + (k + 1) * block_N - T.copy(KV[bid, kv_start:kv_end, cur_kv_head, :], KV_shared) - T.copy(K_pe[bid, kv_start:kv_end, cur_kv_head, :], K_pe_shared) - T.clear(acc_s) - T.gemm( - Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.copy(KV[bid, k * block_N : (k + 1) * block_N, cur_kv_head, :], KV_shared) + T.copy(K_pe[bid, k * block_N : (k + 1) * block_N, cur_kv_head, :], K_pe_shared) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol, clear_accum=True) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) 
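# The attention loops above implement the standard online-softmax recurrence, with log2(e) folded
# into `scale` so the hardware exp2 can be used instead of exp: exp(x * softmax_scale) ==
# 2 ** (x * scale). One KV-tile update written out in PyTorch as an illustrative sketch (acc_o,
# logsum and the running max m are the loop state; the final acc_o / logsum and
# lse = log2(logsum) + m * scale happen after the loop):
import torch

def online_softmax_tile(acc_o, logsum, m, s, v, scale):
    # s: [block_H, block_N] raw scores for this KV tile, v: [block_N, dim]
    m_new = torch.maximum(m, s.max(dim=1).values)
    alpha = torch.exp2((m - m_new) * scale)            # rescale factor for the old accumulators
    p = torch.exp2((s - m_new[:, None]) * scale)       # unnormalized tile probabilities
    logsum = logsum * alpha + p.sum(dim=1)
    acc_o = acc_o * alpha[:, None] + p.to(v.dtype) @ v
    return acc_o, logsum, m_new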
T.copy(acc_s, S_shared) - T.copy(S_shared, acc_s_cast) for i in T.Parallel(block_H): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_H, dim): acc_o[i, j] *= scores_scale[i] - T.gemm(acc_s_cast, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) + T.gemm(S_shared, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] - for i in T.Parallel(block_H): - logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, glse[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, bz]) T.copy(acc_o, O_shared) - T.copy(O_shared, Output_partial[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - bz, :]) - - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - with T.Kernel(heads, batch, threads=128) as (hid, bz): - po_local = T.alloc_fragment([dim], dtype) - o_accum_local = T.alloc_fragment([dim], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - - T.annotate_layout({ - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) - - T.clear(lse_logsum_local) - T.clear(o_accum_local) - lse_max_local[0] = -T.infinity(accum_dtype) - for k in T.serial(num_split): - lse_max_local[0] = T.max(lse_max_local[0], glse[bz, hid, k]) - for k in T.Pipelined(num_split, num_stages=1): - lse_local_split[0] = glse[bz, hid, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] - for k in T.serial(num_split): - for i in T.Parallel(dim): - po_local[i] = Output_partial[bz, hid, k, i] - lse_local_split[0] = glse[bz, hid, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) - for i in T.Parallel(dim): - o_accum_local[i] += po_local[i] * scale_local[0] - for i in T.Parallel(dim): - Output[bz, hid, i] = o_accum_local[i] - - @T.prim_func - def main_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - flash_attn_split(Q, Q_pe, KV, K_pe, glse, Output_partial) - combine(glse, Output_partial, Output) - - @T.prim_func - def main_no_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - flash_attn(Q, Q_pe, KV, K_pe, Output) + T.copy(O_shared, Output[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :]) if num_split > 1: return main_split @@ -252,31 +200,24 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # 
[batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out @@ -294,10 +235,9 @@ def main( BLOCK_N = 64 BLOCK_H = min(64, heads // kv_heads) num_split = 1 - softmax_scale = (dim + pe_dim)**-0.5 + softmax_scale = (dim + pe_dim) ** -0.5 - kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, - softmax_scale) + kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, softmax_scale) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) profiler.assert_allclose(ref_program, rtol=1e-4, atol=1e-4) latency = profiler.do_bench(warmup=500) @@ -305,14 +245,33 @@ def main( print(f"TFlops: {total_flops / latency * 1e-9} TFlops") +def run_regression_perf( + batch=1, + heads=128, + kv_heads=1, + kv_ctx=8192, + dim=512, + pe_dim=64, +): + BLOCK_N = 64 + BLOCK_H = min(64, heads // kv_heads) + num_split = 1 + softmax_scale = (dim + pe_dim) ** -0.5 + + kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, softmax_scale) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) + profiler.assert_allclose(ref_program, rtol=1e-4, atol=1e-4) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=132, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') + parser.add_argument("--batch", type=int, default=132, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + 
parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim main(batch, heads, kv_heads, kv_ctx, dim, pe_dim) diff --git a/examples/deepseek_mla/example_mla_decode_paged.py b/examples/deepseek_mla/example_mla_decode_paged.py index fe50d4d4f..2e1911028 100644 --- a/examples/deepseek_mla/example_mla_decode_paged.py +++ b/examples/deepseek_mla/example_mla_decode_paged.py @@ -8,41 +8,36 @@ @tilelang.jit( - out_idx=[8], pass_configs={ + out_idx=[8], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def mla_decode_tilelang(batch, - h_q, - h_kv, - max_seqlen_pad, - dv, - dpe, - block_N, - block_H, - num_split, - block_size, - softmax_scale=None): + }, +) +def mla_decode_tilelang(batch, h_q, h_kv, max_seqlen_pad, dv, dpe, block_N, block_H, num_split, block_size, softmax_scale=None): if softmax_scale is None: - softmax_scale = (dv + dpe)**-0.5 + softmax_scale = (dv + dpe) ** -0.5 scale = float(softmax_scale * 1.44269504) # log2(e) - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = h_q // h_kv VALID_BLOCK_H = min(block_H, kv_group_num) assert h_kv == 1, "h_kv must be 1" assert block_size >= block_N and block_size % block_N == 0, "block_size must be larger than block_N and a multiple of block_N" - @T.macro - def flash_mla_kernel( - Q: T.Tensor([batch, h_q, dv], dtype), - Q_pe: T.Tensor([batch, h_q, dpe], dtype), - KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), - K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), - BLOCK_TABLE: T.Tensor([batch, max_seqlen_pad // block_size], "int32"), - CACHE_SEQLENS: T.Tensor([batch], "int32"), - Output: T.Tensor([batch, h_q, dv], dtype), + @T.prim_func + def main_split( + Q: T.Tensor([batch, h_q, dv], dtype), + Q_pe: T.Tensor([batch, h_q, dpe], dtype), + KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), + K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), + block_table: T.Tensor([batch, max_seqlen_pad // block_size], T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, h_q, num_split], dtype), + Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), + Output: T.Tensor([batch, h_q, dv], dtype), ): - with T.Kernel(batch, h_q // min(block_H, kv_group_num), threads=256) as (bx, by): + # split kv + with T.Kernel(batch, h_q // min(block_H, kv_group_num), num_split, threads=256) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dv], dtype) S_shared = T.alloc_shared([block_H, block_N], dtype) Q_pe_shared = T.alloc_shared([block_H, dpe], dtype) @@ -50,6 +45,7 @@ def flash_mla_kernel( K_pe_shared = T.alloc_shared([block_N, dpe], dtype) O_shared = T.alloc_shared([block_H, dv], dtype) acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) acc_o = T.alloc_fragment([block_H, dv], accum_dtype) scores_max = T.alloc_fragment([block_H], accum_dtype) scores_max_prev = T.alloc_fragment([block_H], accum_dtype) @@ -59,69 +55,94 @@ def flash_mla_kernel( cur_kv_head = by // (kv_group_num // block_H) T.use_swizzle(10) - T.annotate_layout({ - O_shared: 
tilelang.layout.make_swizzled_layout(O_shared), - S_shared: tilelang.layout.make_swizzled_layout(S_shared), - }) - T.copy(Q[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = T.ceildiv(CACHE_SEQLENS[bx], block_N) - for kr in T.Pipelined(loop_range, num_stages=2): - k = loop_range - 1 - kr - kv_start = BLOCK_TABLE[bx, (k * block_N) // - block_size] * block_size + (k * block_N) % block_size - T.copy(KV[kv_start:kv_start + block_N, cur_kv_head, :], KV_shared) - T.copy(K_pe[kv_start:kv_start + block_N, cur_kv_head, :], K_pe_shared) + total_blocks = T.ceildiv(cache_seqlens[bx], block_N) + blocks_per_split = T.floordiv(total_blocks, num_split) + remaining_blocks = T.floormod(total_blocks, num_split) + loop_range = blocks_per_split + T.if_then_else(bz < remaining_blocks, 1, 0) + start = (blocks_per_split * bz + T.min(bz, remaining_blocks)) * block_N + + for k in T.Pipelined(loop_range, num_stages=2): + kv_start = block_table[bx, (start + k * block_N) // block_size] * block_size + (k * block_N) % block_size + T.copy(KV[kv_start : kv_start + block_N, cur_kv_head, :], KV_shared) + T.copy(K_pe[kv_start : kv_start + block_N, cur_kv_head, :], K_pe_shared) T.clear(acc_s) - T.gemm( - Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) - if kr == 0: - for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else(k * block_N + j >= CACHE_SEQLENS[bx], - -T.infinity(accum_dtype), acc_s[i, j]) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.if_then_else(start + k * block_N + j >= cache_seqlens[bx], -T.infinity(accum_dtype), acc_s[i, j]) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) T.copy(acc_s, S_shared) + T.copy(S_shared, acc_s_cast) for i in T.Parallel(block_H): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_H, dv): acc_o[i, j] *= scores_scale[i] - T.gemm(S_shared, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) + T.gemm(acc_s_cast, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) for i, j in T.Parallel(block_H, dv): acc_o[i, j] /= logsum[i] + for i in T.Parallel(block_H): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + T.copy(logsum, glse[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, bz]) T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :]) - - @T.macro - def flash_mla_split_kv_kernel( - Q: T.Tensor([batch, h_q, dv], dtype), - Q_pe: T.Tensor([batch, h_q, dpe], dtype), - KV: T.Tensor([batch * max_seqlen_pad, 
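# How the split-KV path above partitions a request's KV blocks across `num_split` program ids:
# each split gets floor(total_blocks / num_split) tiles, the first total_blocks % num_split splits
# get one extra, and `start` is the first KV token index owned by split bz. Plain-Python
# restatement of the same index math (helper name is illustrative only):
def split_kv_ranges(total_blocks, num_split, block_N):
    per, rem = divmod(total_blocks, num_split)
    ranges = []
    for bz in range(num_split):
        loop_range = per + (1 if bz < rem else 0)        # number of block_N tiles for this split
        start = (per * bz + min(bz, rem)) * block_N      # first KV token index for this split
        ranges.append((start, loop_range))
    return ranges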
h_kv, dv], dtype), - K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), - BLOCK_TABLE: T.Tensor([batch, max_seqlen_pad // block_size], "int32"), - CACHE_SEQLENS: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, h_q, num_split], dtype), - Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), + T.copy(O_shared, Output_partial[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, bz, :]) + + # combine + with T.Kernel(h_q, batch, threads=128) as (by, bz): + po_local = T.alloc_fragment([dv], dtype) + o_accum_local = T.alloc_fragment([dv], accum_dtype) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + lse_max_local = -T.infinity(accum_dtype) + for k in T.serial(num_split): + lse_max_local = T.max(lse_max_local, glse[bz, by, k]) + for k in T.Pipelined(num_split, num_stages=1): + lse_local_split = glse[bz, by, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local + for k in T.serial(num_split): + for i in T.Parallel(dv): + po_local[i] = Output_partial[bz, by, k, i] + lse_local_split = glse[bz, by, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) + for i in T.Parallel(dv): + o_accum_local[i] += po_local[i] * scale_local + for i in T.Parallel(dv): + Output[bz, by, i] = o_accum_local[i] + + @T.prim_func + def main_no_split( + Q: T.Tensor([batch, h_q, dv], dtype), + Q_pe: T.Tensor([batch, h_q, dpe], dtype), + KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), + K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), + block_table: T.Tensor([batch, max_seqlen_pad // block_size], T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, h_q, num_split], dtype), + Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), + Output: T.Tensor([batch, h_q, dv], dtype), ): - with T.Kernel( - batch, h_q // min(block_H, kv_group_num), num_split, threads=256) as (bx, by, bz): + with T.Kernel(batch, h_q // min(block_H, kv_group_num), threads=256) as (bx, by): Q_shared = T.alloc_shared([block_H, dv], dtype) S_shared = T.alloc_shared([block_H, block_N], dtype) Q_pe_shared = T.alloc_shared([block_H, dpe], dtype) @@ -129,7 +150,6 @@ def flash_mla_split_kv_kernel( K_pe_shared = T.alloc_shared([block_N, dpe], dtype) O_shared = T.alloc_shared([block_H, dv], dtype) acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) - acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) acc_o = T.alloc_fragment([block_H, dv], accum_dtype) scores_max = T.alloc_fragment([block_H], accum_dtype) scores_max_prev = T.alloc_fragment([block_H], accum_dtype) @@ -139,129 +159,45 @@ def flash_mla_split_kv_kernel( cur_kv_head = by // (kv_group_num // block_H) T.use_swizzle(10) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - S_shared: tilelang.layout.make_swizzled_layout(S_shared), - }) - T.copy(Q[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - total_blocks = T.ceildiv(CACHE_SEQLENS[bx], block_N) - blocks_per_split = T.floordiv(total_blocks, num_split) - 
remaining_blocks = T.floormod(total_blocks, num_split) - loop_range = (blocks_per_split + T.if_then_else(bz < remaining_blocks, 1, 0)) - start = (blocks_per_split * bz + T.min(bz, remaining_blocks)) * block_N - - for k in T.Pipelined(loop_range, num_stages=2): - kv_start = BLOCK_TABLE[bx, (start + k * block_N) // - block_size] * block_size + (k * block_N) % block_size - T.copy(KV[kv_start:kv_start + block_N, cur_kv_head, :], KV_shared) - T.copy(K_pe[kv_start:kv_start + block_N, cur_kv_head, :], K_pe_shared) + loop_range = T.ceildiv(cache_seqlens[bx], block_N) + for kr in T.Pipelined(loop_range, num_stages=2): + k = loop_range - 1 - kr + kv_start = block_table[bx, (k * block_N) // block_size] * block_size + (k * block_N) % block_size + T.copy(KV[kv_start : kv_start + block_N, cur_kv_head, :], KV_shared) + T.copy(K_pe[kv_start : kv_start + block_N, cur_kv_head, :], K_pe_shared) T.clear(acc_s) - T.gemm( - Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) - for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else(start + k * block_N + j >= CACHE_SEQLENS[bx], - -T.infinity(accum_dtype), acc_s[i, j]) + if kr == 0: + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= cache_seqlens[bx], -T.infinity(accum_dtype), acc_s[i, j]) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) T.copy(acc_s, S_shared) - T.copy(S_shared, acc_s_cast) for i in T.Parallel(block_H): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_H, dv): acc_o[i, j] *= scores_scale[i] - T.gemm(acc_s_cast, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) + T.gemm(S_shared, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) for i, j in T.Parallel(block_H, dv): acc_o[i, j] /= logsum[i] - for i in T.Parallel(block_H): - logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, glse[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, bz]) T.copy(acc_o, O_shared) - T.copy(O_shared, Output_partial[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, bz, :]) - - @T.macro - def combine( - glse: T.Tensor([batch, h_q, num_split], dtype), - Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), - Output: T.Tensor([batch, h_q, dv], dtype), - ): - with T.Kernel(h_q, batch, threads=128) as (by, bz): - po_local = T.alloc_fragment([dv], dtype) - o_accum_local = T.alloc_fragment([dv], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - - T.annotate_layout({ - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) - - T.clear(lse_logsum_local) - T.clear(o_accum_local) - lse_max_local[0] = -T.infinity(accum_dtype) - for k in 
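# Element-wise restatement of the block-table addressing used above: logical KV token `tok` of
# request b lives at physical row block_table[b, tok // block_size] * block_size + tok % block_size
# of the flat [num_blocks * block_size, h_kv, d] cache. The kernel copies whole block_N-sized,
# block-aligned tiles, so it only needs one lookup per tile; this gather is a per-token sketch
# (helper name is ours, not part of the patch):
import torch

def gather_paged_kv(kv, block_table, b, tok_start, n_tok, block_size):
    toks = torch.arange(tok_start, tok_start + n_tok, device=kv.device)
    rows = block_table[b, toks // block_size].long() * block_size + toks % block_size
    return kv[rows]  # [n_tok, h_kv, d]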
T.serial(num_split): - lse_max_local[0] = T.max(lse_max_local[0], glse[bz, by, k]) - for k in T.Pipelined(num_split, num_stages=1): - lse_local_split[0] = glse[bz, by, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] - for k in T.serial(num_split): - for i in T.Parallel(dv): - po_local[i] = Output_partial[bz, by, k, i] - lse_local_split[0] = glse[bz, by, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) - for i in T.Parallel(dv): - o_accum_local[i] += po_local[i] * scale_local[0] - for i in T.Parallel(dv): - Output[bz, by, i] = o_accum_local[i] - - @T.prim_func - def main_split( - Q: T.Tensor([batch, h_q, dv], dtype), - Q_pe: T.Tensor([batch, h_q, dpe], dtype), - KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), - K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), - block_table: T.Tensor([batch, max_seqlen_pad // block_size], "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, h_q, num_split], dtype), - Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), - Output: T.Tensor([batch, h_q, dv], dtype), - ): - flash_mla_split_kv_kernel(Q, Q_pe, KV, K_pe, block_table, cache_seqlens, glse, - Output_partial) - combine(glse, Output_partial, Output) - - @T.prim_func - def main_no_split( - Q: T.Tensor([batch, h_q, dv], dtype), - Q_pe: T.Tensor([batch, h_q, dpe], dtype), - KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), - K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), - block_table: T.Tensor([batch, max_seqlen_pad // block_size], "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, h_q, num_split], dtype), - Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), - Output: T.Tensor([batch, h_q, dv], dtype), - ): - flash_mla_kernel(Q, Q_pe, KV, K_pe, block_table, cache_seqlens, Output) + T.copy(O_shared, Output[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :]) if num_split > 1: return main_split @@ -280,8 +216,7 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): s_q = query.shape[-2] s_k = key.shape[-2] attn_bias = torch.zeros(s_q, s_k, dtype=query.dtype, device=query.device) - temp_mask = torch.ones( - s_q, s_k, dtype=torch.bool, device=query.device).tril(diagonal=s_k - s_q) + temp_mask = torch.ones(s_q, s_k, dtype=torch.bool, device=query.device).tril(diagonal=s_k - s_q) attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) attn_bias.to(query.dtype) attn_weight += attn_bias @@ -291,8 +226,7 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): @torch.inference_mode() -def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, - h_kv, d, dv, causal, dtype): +def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): # q: [b, s_q, h_q, d] # block_table: [b, max_seqlen_pad // block_size] # blocked_k: [b * max_seqlen_pad // block_size, block_size, h_kv, d] @@ -321,13 +255,10 @@ def ref_mla(): return out_torch -def run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, - h_q, h_kv, d, dv, causal, dtype): - +def run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., 
dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() dpe = d - dv num_kv_splits = 1 @@ -337,8 +268,7 @@ def run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s out_partial = torch.empty(b, h_q, num_kv_splits, dv, dtype=dtype, device=q.device) glse = torch.empty(b, h_q, num_kv_splits, dtype=dtype, device=q.device) - kernel = mla_decode_tilelang(b, h_q, h_kv, max_seqlen_pad, dv, dpe, BLOCK_N, BLOCK_H, - num_kv_splits, block_size, softmax_scale) + kernel = mla_decode_tilelang(b, h_q, h_kv, max_seqlen_pad, dv, dpe, BLOCK_N, BLOCK_H, num_kv_splits, block_size, softmax_scale) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) def flash_mla_tilelang(): @@ -356,8 +286,7 @@ def flash_mla_tilelang(): out_flash = flash_mla_tilelang() t = do_bench(flash_mla_tilelang) - out_ref = run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, - cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_ref = run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) torch.testing.assert_close(out_flash, out_ref, rtol=0.01, atol=0.01) print("All close") return out_flash, t @@ -365,12 +294,12 @@ def flash_mla_tilelang(): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=128, help='batch size') - parser.add_argument('--h_q', type=int, default=128, help='q heads number') - parser.add_argument('--h_kv', type=int, default=1, help='kv heads number') - parser.add_argument('--cache_seqlen', type=int, default=8192, help='kv cache context length') - parser.add_argument('--d', type=int, default=576, help='query/key head dim, d = dv + dpe') - parser.add_argument('--dv', type=int, default=512, help='value head dim') + parser.add_argument("--batch", type=int, default=128, help="batch size") + parser.add_argument("--h_q", type=int, default=128, help="q heads number") + parser.add_argument("--h_kv", type=int, default=1, help="kv heads number") + parser.add_argument("--cache_seqlen", type=int, default=8192, help="kv cache context length") + parser.add_argument("--d", type=int, default=576, help="query/key head dim, d = dv + dpe") + parser.add_argument("--dv", type=int, default=512, help="value head dim") args = parser.parse_args() b, h_q, h_kv, cache_seqlen, d, dv = args.batch, args.h_q, args.h_kv, args.cache_seqlen, args.d, args.dv @@ -379,9 +308,7 @@ def flash_mla_tilelang(): s_q = 1 # for decode, s_q = 1 block_size = 64 - cache_seqlens = torch.tensor([cache_seqlen + 2 * i for i in range(b)], - dtype=torch.int32, - device=device) + cache_seqlens = torch.tensor([cache_seqlen + 2 * i for i in range(b)], dtype=torch.int32, device=device) dpe = d - dv causal = True @@ -393,12 +320,11 @@ def flash_mla_tilelang(): total_flops = s_q * total_seqlens * h_q * d * 2 q = torch.randn(b, s_q, h_q, d, dtype=dtype, device=device) - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32, - device=device).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32, device=device).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d, dtype=dtype, device=device) - out_flash, latency = run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, 
block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_flash, latency = run_tilelang_mla( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) print("Tile-lang: {:.2f} ms".format(latency)) print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) diff --git a/examples/deepseek_mla/example_mla_decode_persistent.py b/examples/deepseek_mla/example_mla_decode_persistent.py index 3f57ea051..74d974fbb 100644 --- a/examples/deepseek_mla/example_mla_decode_persistent.py +++ b/examples/deepseek_mla/example_mla_decode_persistent.py @@ -9,13 +9,15 @@ @tilelang.jit( - out_idx=[6], pass_configs={ + out_idx=[6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split): - scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" + scale = (1.0 / (dim + pe_dim)) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" @@ -23,13 +25,13 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ @T.prim_func def main_split_persistent( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(sm_num, threads=256) as (block_id): Q_shared = T.alloc_shared([block_H, dim], dtype) @@ -48,16 +50,11 @@ def main_split_persistent( logsum = T.alloc_fragment([block_H], accum_dtype) po_local = T.alloc_fragment([dim], dtype) o_accum_local = T.alloc_fragment([dim], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - - T.annotate_layout({ - # O_shared: tilelang.layout.make_swizzled_layout(O_shared), - S_shared: tilelang.layout.make_swizzled_layout(S_shared), - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + T.use_swizzle(10) total_tiles = batch * (heads // min(block_H, kv_group_num)) * num_split @@ -70,8 +67,8 @@ def main_split_persistent( cur_kv_head = hid // (kv_group_num // block_H) if bid < batch and hid * VALID_BLOCK_H < heads and sid < num_split: - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bid, 
hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -83,24 +80,15 @@ def main_split_persistent( T.copy(KV[bid, kv_start:kv_end, cur_kv_head, :], KV_shared) T.copy(K_pe[bid, kv_start:kv_end, cur_kv_head, :], K_pe_shared) T.clear(acc_s) - T.gemm( - Q_shared, - KV_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - - scores_max[i] * scale) + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_H): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) @@ -115,11 +103,9 @@ def main_split_persistent( acc_o[i, j] /= logsum[i] for i in T.Parallel(block_H): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, glse[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, sid]) + T.copy(logsum, glse[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, sid]) # T.copy(acc_o, O_shared) - T.copy( - acc_o, Output_partial[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - sid, :]) + T.copy(acc_o, Output_partial[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, sid, :]) T.sync_grid() waves = T.ceildiv(heads * batch, sm_num) @@ -130,20 +116,20 @@ def main_split_persistent( if bid < batch and hid < heads: T.clear(lse_logsum_local) T.clear(o_accum_local) - lse_max_local[0] = -T.infinity(accum_dtype) + lse_max_local = -T.infinity(accum_dtype) for k in T.serial(num_split): - lse_max_local[0] = T.max(lse_max_local[0], glse[bid, hid, k]) + lse_max_local = T.max(lse_max_local, glse[bid, hid, k]) for k in T.Pipelined(num_split, num_stages=1): - lse_local_split[0] = glse[bid, hid, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] + lse_local_split = glse[bid, hid, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local for k in T.serial(num_split): for i in T.Parallel(dim): po_local[i] = Output_partial[bid, hid, k, i] - lse_local_split[0] = glse[bid, hid, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) + lse_local_split = glse[bid, hid, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) for i in T.Parallel(dim): - o_accum_local[i] += po_local[i] * scale_local[0] + o_accum_local[i] += po_local[i] * scale_local for i in T.Parallel(dim): Output[bid, hid, i] = o_accum_local[i] @@ -165,42 +151,35 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, 
num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out def main(): parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=128, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') + parser.add_argument("--batch", type=int, default=128, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim qk_flops = 2 * batch * heads * kv_ctx * (dim + pe_dim) diff --git a/examples/deepseek_mla/example_mla_decode_ws.py b/examples/deepseek_mla/example_mla_decode_ws.py index 6554d57de..32eb0d475 100644 --- a/examples/deepseek_mla/example_mla_decode_ws.py +++ b/examples/deepseek_mla/example_mla_decode_ws.py @@ -13,30 +13,38 @@ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, }, compile_flags=[ - "-O3", "-Wno-deprecated-declarations", "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__", "-U__CUDA_NO_HALF2_OPERATORS__", - "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", "--expt-relaxed-constexpr", "--expt-extended-lambda", - "--ptxas-options=-v,--register-usage-level=10", "-DNDEBUG" + "-O3", + "-Wno-deprecated-declarations", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "--expt-relaxed-constexpr", + 
"--expt-extended-lambda", + "--ptxas-options=-v,--register-usage-level=10", + "-DNDEBUG", ], ) -def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, - softmax_scale): +def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, softmax_scale): sm_scale = float(softmax_scale * 1.44269504) # log2(e) - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" - @T.macro - def flash_attn( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + @T.prim_func + def main_split( + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): - with T.Kernel(heads // min(block_H, kv_group_num), batch, threads=384) as (hid, bid): + # flash_attn_split + with T.Kernel(batch, heads // min(block_H, kv_group_num), num_split, threads=384) as (bid, hid, bz): Q_shared_l = T.alloc_shared([block_H, dim // 2], dtype) Q_shared_r = T.alloc_shared([block_H, dim // 2], dtype) Q_tail_shared = T.alloc_shared([block_H, pe_dim], dtype) @@ -75,16 +83,16 @@ def flash_attn( tx = T.get_thread_binding() - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, 0:dim // 2], Q_shared_l) - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, dim // 2:dim], Q_shared_r) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_tail_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, 0 : dim // 2], Q_shared_l) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, dim // 2 : dim], Q_shared_r) + T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_tail_shared) T.barrier_arrive(bar_q) if tx < 128: T.set_max_nreg(240, 1) T.fill(sumexp, 0) - T.fill(m_i, -2**30) # avoid -inf - inf to cause nan + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan T.fill(acc_o_l, 0) T.barrier_wait(bar_q, 0) @@ -105,6 +113,8 @@ def flash_attn( T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): @@ -137,6 +147,8 @@ def flash_attn( T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): @@ -162,8 +174,8 @@ def flash_attn( for h_i in T.Parallel(block_H): sumexp[h_i] = T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale T.copy(acc_o_l, O_shared_l) - T.copy(O_shared_l, Output[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - 0:dim // 2]) + T.copy(O_shared_l, Output_partial[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, bz, 0 : dim // 2]) + T.copy(sumexp, 
glse[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, bz]) elif tx >= 128 and tx < 256: T.set_max_nreg(168, 1) @@ -193,8 +205,7 @@ def flash_attn( acc_o_r[h_i, d_i] /= sum_exp_shared[h_i] T.copy(acc_o_r, O_shared_r) - T.copy(O_shared_r, Output[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - dim // 2:dim]) + T.copy(O_shared_r, Output_partial[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, bz, dim // 2 : dim]) elif tx >= 256: # producer @@ -203,59 +214,82 @@ def flash_attn( # Buffer 0 T.barrier_wait(bar_k_0_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - kv_indices = (i_i * 2) * block_N + r * 16 + (tx - 256) // 8 + kv_indices = (seqlen_kv // num_split) * bz + (i_i * 2) * block_N + r * 16 + (tx - 256) // 8 with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_0_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_0_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, dim // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = K_pe[bid, kv_indices, cur_kv_head, - (tx - 256) % 8 * 8 + v] + K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ + bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_0_ready[0]) # Buffer 1 T.barrier_wait(bar_k_1_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - kv_indices = (i_i * 2 + 1) * block_N + r * 16 + (tx - 256) // 8 + kv_indices = (seqlen_kv // num_split) * bz + (i_i * 2 + 1) * block_N + r * 16 + (tx - 256) // 8 with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_1_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_1_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, dim // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = K_pe[bid, kv_indices, cur_kv_head, - (tx - 256) % 8 * 8 + v] + K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ + bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_1_ready[0]) - @T.macro - def flash_attn_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + # combine + with 
T.Kernel(heads, batch, threads=128) as (hid, bz): + po_local = T.alloc_fragment([dim], dtype) + o_accum_local = T.alloc_fragment([dim], accum_dtype) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + lse_max_local = -T.infinity(accum_dtype) + for k in T.serial(num_split): + lse_max_local = T.max(lse_max_local, glse[bz, hid, k]) + for k in T.Pipelined(num_split, num_stages=1): + lse_local_split = glse[bz, hid, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local + for k in T.serial(num_split): + for i in T.Parallel(dim): + po_local[i] = Output_partial[bz, hid, k, i] + lse_local_split = glse[bz, hid, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) + for i in T.Parallel(dim): + o_accum_local[i] += po_local[i] * scale_local + for i in T.Parallel(dim): + Output[bz, hid, i] = o_accum_local[i] + + @T.prim_func + def main_no_split( + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): - with T.Kernel( - batch, heads // min(block_H, kv_group_num), num_split, - threads=384) as (bid, hid, bz): + with T.Kernel(heads // min(block_H, kv_group_num), batch, threads=384) as (hid, bid): Q_shared_l = T.alloc_shared([block_H, dim // 2], dtype) Q_shared_r = T.alloc_shared([block_H, dim // 2], dtype) Q_tail_shared = T.alloc_shared([block_H, pe_dim], dtype) @@ -294,16 +328,16 @@ def flash_attn_split( tx = T.get_thread_binding() - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, 0:dim // 2], Q_shared_l) - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, dim // 2:dim], Q_shared_r) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_tail_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, 0 : dim // 2], Q_shared_l) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, dim // 2 : dim], Q_shared_r) + T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_tail_shared) T.barrier_arrive(bar_q) if tx < 128: T.set_max_nreg(240, 1) T.fill(sumexp, 0) - T.fill(m_i, -2**30) # avoid -inf - inf to cause nan + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan T.fill(acc_o_l, 0) T.barrier_wait(bar_q, 0) @@ -323,7 +357,9 @@ def flash_attn_split( T.barrier_wait(bar_sScale_and_sS_free, ((i_i * 2) & 1) ^ 1) T.copy(m_i, m_i_prev) - T.reduce_max(acc_s, m_i, dim=1, clear=False) + T.reduce_max(acc_s, out=m_i, dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): @@ -356,6 +392,8 @@ def flash_attn_split( T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): @@ -381,10 +419,7 @@ def flash_attn_split( for h_i in T.Parallel(block_H): 
sumexp[h_i] = T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale T.copy(acc_o_l, O_shared_l) - T.copy( - O_shared_l, Output_partial[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - bz, 0:dim // 2]) - T.copy(sumexp, glse[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, bz]) + T.copy(O_shared_l, Output[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, 0 : dim // 2]) elif tx >= 128 and tx < 256: T.set_max_nreg(168, 1) @@ -414,9 +449,7 @@ def flash_attn_split( acc_o_r[h_i, d_i] /= sum_exp_shared[h_i] T.copy(acc_o_r, O_shared_r) - T.copy( - O_shared_r, Output_partial[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - bz, dim // 2:dim]) + T.copy(O_shared_r, Output[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, dim // 2 : dim]) elif tx >= 256: # producer @@ -425,111 +458,43 @@ def flash_attn_split( # Buffer 0 T.barrier_wait(bar_k_0_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - kv_indices = (seqlen_kv // num_split) * bz + ( - i_i * 2) * block_N + r * 16 + (tx - 256) // 8 + kv_indices = (i_i * 2) * block_N + r * 16 + (tx - 256) // 8 with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_0_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_0_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, dim // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = K_pe[bid, kv_indices, cur_kv_head, - (tx - 256) % 8 * 8 + v] + K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ + bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_0_ready[0]) # Buffer 1 T.barrier_wait(bar_k_1_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - kv_indices = (seqlen_kv // num_split) * bz + ( - i_i * 2 + 1) * block_N + r * 16 + (tx - 256) // 8 + kv_indices = (i_i * 2 + 1) * block_N + r * 16 + (tx - 256) // 8 with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_1_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_1_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, dim // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = K_pe[bid, kv_indices, cur_kv_head, - (tx - 256) % 8 * 8 + v] + K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ + bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_1_ready[0]) - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split], 
dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - with T.Kernel(heads, batch, threads=128) as (hid, bz): - po_local = T.alloc_fragment([dim], dtype) - o_accum_local = T.alloc_fragment([dim], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - - T.annotate_layout({ - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) - - T.clear(lse_logsum_local) - T.clear(o_accum_local) - lse_max_local[0] = -T.infinity(accum_dtype) - for k in T.serial(num_split): - lse_max_local[0] = T.max(lse_max_local[0], glse[bz, hid, k]) - for k in T.Pipelined(num_split, num_stages=1): - lse_local_split[0] = glse[bz, hid, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] - for k in T.serial(num_split): - for i in T.Parallel(dim): - po_local[i] = Output_partial[bz, hid, k, i] - lse_local_split[0] = glse[bz, hid, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) - for i in T.Parallel(dim): - o_accum_local[i] += po_local[i] * scale_local[0] - for i in T.Parallel(dim): - Output[bz, hid, i] = o_accum_local[i] - - @T.prim_func - def main_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - flash_attn_split(Q, Q_pe, KV, K_pe, glse, Output_partial) - combine(glse, Output_partial, Output) - - @T.prim_func - def main_no_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - flash_attn(Q, Q_pe, KV, K_pe, Output) - if num_split > 1: return main_split else: @@ -551,31 +516,24 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores 
= einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out @@ -593,10 +551,9 @@ def main( BLOCK_N = 64 BLOCK_H = min(64, heads // kv_heads) num_split = 1 - softmax_scale = (dim + pe_dim)**-0.5 + softmax_scale = (dim + pe_dim) ** -0.5 - kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, - softmax_scale) + kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, softmax_scale) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) profiler.assert_allclose(ref_program, rtol=1e-4, atol=1e-4) latency = profiler.do_bench(warmup=500) @@ -606,12 +563,12 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=132, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') + parser.add_argument("--batch", type=int, default=132, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim main(batch, heads, kv_heads, kv_ctx, dim, pe_dim) diff --git a/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py b/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py index 1b1447e88..e70c35349 100644 --- a/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py +++ b/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py @@ -8,25 +8,27 @@ @tilelang.jit( - out_idx=[-1], pass_configs={ + out_idx=[-1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H): - scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) - dtype = "float16" - q_dtype = "float8_e4m3" - accum_dtype = "float" + scale = (1.0 / (dim + pe_dim)) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + q_dtype = T.float8_e4m3fn + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" 
@T.prim_func def main_no_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], q_dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], q_dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(batch, heads // min(block_H, kv_group_num), threads=256) as (bx, by): Q_shared = T.alloc_shared([block_H, dim], dtype) @@ -46,34 +48,27 @@ def main_no_split( cur_kv_head = by // (kv_group_num // block_H) T.use_swizzle(10) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) - T.copy(Q[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) T.disable_warp_group_reg_alloc() loop_range = T.ceildiv(seqlen_kv, block_N) for k in T.Pipelined(loop_range, num_stages=2): - T.copy(KV[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], qKV_shared) - T.copy(K_pe[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], K_pe_shared) + T.copy(KV[bx, k * block_N : (k + 1) * block_N, cur_kv_head, :], qKV_shared) + T.copy(K_pe[bx, k * block_N : (k + 1) * block_N, cur_kv_head, :], K_pe_shared) T.copy(qKV_shared, KV_shared) T.clear(acc_s) - T.gemm( - Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -88,7 +83,7 @@ def main_no_split( for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :]) + T.copy(O_shared, Output[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :]) return main_no_split @@ -106,42 +101,35 @@ def ref_program(q, q_pe, kv, k_pe): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, 
seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=128, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') + parser.add_argument("--batch", type=int, default=128, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim qk_flops = 2 * batch * heads * kv_ctx * (dim + pe_dim) diff --git a/examples/deepseek_mla/regression_example_mla_decode.py b/examples/deepseek_mla/regression_example_mla_decode.py new file mode 100644 index 000000000..64e1c436a --- /dev/null +++ b/examples/deepseek_mla/regression_example_mla_decode.py @@ -0,0 +1,10 @@ +import tilelang.testing +import example_mla_decode + + +def regression_example_mla_decode(): + tilelang.testing.process_func(example_mla_decode.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/deepseek_mla/test_example_mla_decode.py b/examples/deepseek_mla/test_example_mla_decode.py index 66a750f7d..a269ea57a 100644 --- a/examples/deepseek_mla/test_example_mla_decode.py +++ b/examples/deepseek_mla/test_example_mla_decode.py @@ -1,5 +1,4 @@ import tilelang.testing - import example_mla_decode diff --git a/examples/deepseek_mla/torch_refs.py b/examples/deepseek_mla/torch_refs.py index 4b4c888cd..aae6c7cd2 100644 --- a/examples/deepseek_mla/torch_refs.py +++ b/examples/deepseek_mla/torch_refs.py @@ -11,7 +11,7 @@ def flash_split_ref(Q, Q_pe, KV, K_pe): block_N = 64 seqlen_kv = KV.size(1) - scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) + scale 
= (1.0 / (dim + pe_dim)) ** 0.5 * 1.44269504 # log2(e) acc_s = torch.empty((batch, nheads, block_N), device="cuda", dtype=torch.float) acc_s_cast = torch.empty((batch, nheads, block_N), device="cuda", dtype=torch.float16) acc_o = torch.empty((batch, nheads, dim), device="cuda", dtype=torch.float) @@ -31,18 +31,20 @@ def flash_split_ref(Q, Q_pe, KV, K_pe): for ks in range(num_split): acc_o.fill_(0) logsum.fill_(0) - scores_max.fill_(float('-inf')) - scores_max_prev.fill_(float('-inf')) + scores_max.fill_(float("-inf")) + scores_max_prev.fill_(float("-inf")) for i in range(int((seqlen_kv // num_split) / block_N)): acc_s.fill_(0) - acc_s = torch.einsum('bhd,bkhd->bhk', Q_, - KV_[:, (seqlen_kv // num_split) * ks + - i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) # [batch, nheads, block_N] + acc_s = torch.einsum( + "bhd,bkhd->bhk", + Q_, + KV_[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) # [batch, nheads, block_N] acc_s += torch.einsum( - 'bhd,bkhd->bhk', Q_pe_, - K_pe_[:, (seqlen_kv // num_split) * ks + i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) + "bhd,bkhd->bhk", + Q_pe_, + K_pe_[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) scores_max_prev = scores_max scores_max = acc_s.max(dim=-1, keepdim=False).values # [batch, nheads] scores_scale = torch.exp2(scores_max_prev - scores_max) # [batch, nheads] @@ -50,9 +52,10 @@ def flash_split_ref(Q, Q_pe, KV, K_pe): acc_s = torch.exp2(acc_s - scores_max[:, :, None]) acc_s_cast = acc_s.to(torch.float16) # [batch, nheads, block_N] acc_o += torch.einsum( - 'bhk,bkhd->bhd', acc_s_cast, - KV_[:, (seqlen_kv // num_split) * ks + i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) + "bhk,bkhd->bhd", + acc_s_cast, + KV_[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) scores_sum = acc_s.sum(dim=-1, keepdim=False) logsum = logsum * scores_scale + scores_sum acc_o /= logsum[:, :, None] diff --git a/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py b/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py index daee39865..ca98d01be 100644 --- a/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py +++ b/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py @@ -14,21 +14,44 @@ from fla.utils import autocast_custom_fwd, contiguous -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) @triton.jit -def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, block_indices, - block_counts, offsets, token_indices, T, H: tl.constexpr, - HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, V: tl.constexpr, - S: tl.constexpr, BS: tl.constexpr, WS: tl.constexpr, BK: tl.constexpr, - BV: tl.constexpr, USE_OFFSETS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_fwd_kernel( + q, + k, + v, + o_slc, + o_swa, + lse_slc, + lse_swa, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: 
tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H @@ -40,20 +63,18 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc NS = S - p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # the Q block is kept in the shared memory throughout the whole kernel # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), - (G, BV), (1, 0)) + p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_slc = tl.zeros([G, BV], dtype=tl.float32) - b_m_slc = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_slc = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_slc = tl.zeros([G], dtype=tl.float32) for i in range(NS): i_s = tl.load(block_indices + i).to(tl.int32) * BS @@ -66,7 +87,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_slc = tl.load(p_v_slc, boundary_check=(0, 1)) # [G, BS] b_s_slc = tl.dot(b_q, b_k_slc) - b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float('-inf')) + b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float("-inf")) # [G] b_m_slc, b_mp_slc = tl.maximum(b_m_slc, tl.max(b_s_slc, 1)), b_m_slc @@ -87,7 +108,6 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -100,8 +120,7 @@ def forward(ctx, q, k, v, block_indices, block_size, scale, offsets): # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]] token_indices = prepare_token_indices(offsets) if offsets is not None else None - o, lse = parallel_nsa_fwd( - q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) + o, lse = parallel_nsa_fwd(q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) ctx.save_for_backward(q, k, v, o, lse) ctx.block_indices = block_indices ctx.block_size = block_size @@ -172,7 +191,6 @@ def parallel_nsa_fwd( @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -195,7 +213,8 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, window_size=window_size, scale=scale, offsets=offsets, - token_indices=token_indices) + token_indices=token_indices, + ) ctx.save_for_backward(q, k, v, o_slc, lse_slc, o_swa, lse_swa) ctx.block_indices = block_indices ctx.block_counts = block_counts @@ -207,18 +226,20 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, return o_slc.to(q.dtype), o_swa.to(q.dtype) if o_swa is not None else o_swa -def parallel_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: 
Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def parallel_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -258,44 +279,44 @@ def parallel_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. """ if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, - window_size, scale, cu_seqlens) + o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: o = torch.addcmul(o_slc * g_slc.unsqueeze(-1), o_swa, g_swa.unsqueeze(-1)) else: o = o_slc * g_slc.unsqueeze(-1) if head_first: - o = rearrange(o, 'b t h d -> b h t d') + o = rearrange(o, "b t h d -> b h t d") return o -def naive_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def naive_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -335,26 +356,24 @@ def naive_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
""" if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - raise RuntimeError( - "Sequences with variable lengths are not supported for head-first mode") + raise RuntimeError("Sequences with variable lengths are not supported for head-first mode") if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") dtype = q.dtype G = q.shape[2] // k.shape[2] BS = block_size S = block_indices.shape[-1] - k, v, block_indices = (repeat(x, 'b t h d -> b t (h g) d', g=G) for x in (k, v, block_indices)) + k, v, block_indices = (repeat(x, "b t h d -> b t (h g) d", g=G) for x in (k, v, block_indices)) if isinstance(block_counts, torch.Tensor): - block_counts = repeat(block_counts, 'b t h -> b t (h g)', g=G) + block_counts = repeat(block_counts, "b t h -> b t (h g)", g=G) c = torch.arange(S).repeat_interleave(BS).unsqueeze(1).expand(-1, q.shape[2]).to(q.device) q, k, v = map(lambda x: x.float(), (q, k, v)) @@ -364,14 +383,11 @@ def naive_nsa(q: torch.Tensor, if cu_seqlens is None: varlen = False B, T = q.shape[:2] - cu_seqlens = torch.cat( - [block_indices.new_tensor(range(0, B * T, T)), - block_indices.new_tensor([B * T])]) + cu_seqlens = torch.cat([block_indices.new_tensor(range(0, B * T, T)), block_indices.new_tensor([B * T])]) for i in range(len(cu_seqlens) - 1): if not varlen: - q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = q[i], k[i], v[i], g_slc[i], g_swa[ - i], block_indices[i] + q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = q[i], k[i], v[i], g_slc[i], g_swa[i], block_indices[i] if isinstance(block_counts, torch.Tensor): s_b = block_counts[i] else: @@ -379,10 +395,10 @@ def naive_nsa(q: torch.Tensor, else: T = cu_seqlens[i + 1] - cu_seqlens[i] q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = map( - lambda x: x[0][cu_seqlens[i]:cu_seqlens[i + 1]], - (q, k, v, g_slc, g_swa, block_indices)) + lambda x: x[0][cu_seqlens[i] : cu_seqlens[i + 1]], (q, k, v, g_slc, g_swa, block_indices) + ) if isinstance(block_counts, torch.Tensor): - s_b = block_counts[0][cu_seqlens[i]:cu_seqlens[i + 1]] + s_b = block_counts[0][cu_seqlens[i] : cu_seqlens[i + 1]] else: s_b = block_counts @@ -404,71 +420,58 @@ def naive_nsa(q: torch.Tensor, else: s_i = s_b # [S*BS, HQ, -1] - k_i_slc, v_i_slc = map( - lambda x: x.gather( - 0, - i_i.clamp(0, T - 1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])), (k_b, v_b)) + k_i_slc, v_i_slc = map(lambda x: x.gather(0, i_i.clamp(0, T - 1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])), (k_b, v_b)) # [S*BS, HQ] - attn_slc = torch.einsum('h d, n h d -> n h', q_i, k_i_slc).masked_fill( - torch.logical_or(i_i < 0, i_i > i_q) | - (c >= s_i if block_counts is not None else False), float('-inf')).softmax(0) + attn_slc = ( + torch.einsum("h d, n h d -> n h", q_i, k_i_slc) + .masked_fill(torch.logical_or(i_i < 0, i_i > i_q) | (c >= s_i if block_counts is not None else False), float("-inf")) + .softmax(0) + ) if not varlen: - o_slc[i, i_q] = torch.einsum('n h, n h v -> h v', attn_slc, - v_i_slc) * 
g_slc_i.unsqueeze(-1) + o_slc[i, i_q] = torch.einsum("n h, n h v -> h v", attn_slc, v_i_slc) * g_slc_i.unsqueeze(-1) else: - o_slc[0][cu_seqlens[i] + i_q] = torch.einsum('n h, n h v -> h v', attn_slc, - v_i_slc) * g_slc_i.unsqueeze(-1) + o_slc[0][cu_seqlens[i] + i_q] = torch.einsum("n h, n h v -> h v", attn_slc, v_i_slc) * g_slc_i.unsqueeze(-1) if window_size > 0: - k_i_swa, v_i_swa = map(lambda x: x[max(0, i_q - window_size + 1):i_q + 1], - (k_b, v_b)) - attn_swa = torch.einsum('h d, n h d -> n h', q_i, k_i_swa).softmax(0) + k_i_swa, v_i_swa = map(lambda x: x[max(0, i_q - window_size + 1) : i_q + 1], (k_b, v_b)) + attn_swa = torch.einsum("h d, n h d -> n h", q_i, k_i_swa).softmax(0) if not varlen: - o_swa[i, i_q] = torch.einsum('n h, n h v -> h v', attn_swa, - v_i_swa) * g_swa_i.unsqueeze(-1) + o_swa[i, i_q] = torch.einsum("n h, n h v -> h v", attn_swa, v_i_swa) * g_swa_i.unsqueeze(-1) else: - o_swa[0][cu_seqlens[i] + i_q] = torch.einsum('n h, n h v -> h v', attn_swa, - v_i_swa) * g_swa_i.unsqueeze(-1) + o_swa[0][cu_seqlens[i] + i_q] = torch.einsum("n h, n h v -> h v", attn_swa, v_i_swa) * g_swa_i.unsqueeze(-1) if head_first: - o_slc = rearrange(o_slc, 'b t h d -> b h t d') - o_swa = rearrange(o_swa, 'b t h d -> b h t d') + o_slc = rearrange(o_slc, "b t h d -> b h t d") + o_swa = rearrange(o_swa, "b t h d -> b h t d") return o_slc.to(dtype) + o_swa.to(dtype) if o_swa is not None else o_slc.to(dtype) def get_configs(): import itertools + iter_params = dict( block_T=[128, 256, 512], num_stages=[0, 1, 2, 4, 5], threads=[32, 64, 128, 256, 512], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] -@tilelang.autotune(configs=get_configs(),) +@tilelang.autotune( + configs=get_configs(), +) @tilelang.jit( pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) -def tilelang_sparse_attention(batch, - heads, - seq_len, - dim, - is_causal, - scale=None, - block_size=64, - groups=1, - selected_blocks=16, - block_T=128, - num_stages=2, - threads=32): + } +) +def tilelang_sparse_attention( + batch, heads, seq_len, dim, is_causal, scale=None, block_size=64, groups=1, selected_blocks=16, block_T=128, num_stages=2, threads=32 +): if scale is None: - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) else: scale = scale * 1.44269504 # log2(e) @@ -476,9 +479,9 @@ def tilelang_sparse_attention(batch, q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] block_indices_shape = [batch, seq_len, head_kv, selected_blocks] - block_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" + block_indices_dtype = T.int32 + dtype = T.float16 + accum_dtype = T.float32 block_S = block_size block_T = min(block_T, tilelang.math.next_power_of_2(dim)) @@ -493,11 +496,11 @@ def tilelang_sparse_attention(batch, @T.prim_func def tilelang_sparse_attention( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), + Output: T.Tensor(q_shape, dtype), ): 
with T.Kernel(seq_len, NV, batch * head_kv, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([G, BK], dtype) @@ -514,13 +517,11 @@ def tilelang_sparse_attention( scores_sum = T.alloc_fragment([G], accum_dtype) logsum = T.alloc_fragment([G], accum_dtype) - T.annotate_layout({O_shared: tilelang.layout.make_swizzled_layout(O_shared)}) - i_t, i_v, i_bh = bx, by, bz i_b, i_h = i_bh // head_kv, i_bh % head_kv NS = S - T.copy(Q[i_b, i_t, i_h * G:(i_h + 1) * G, :], Q_shared) + T.copy(Q[i_b, i_t, i_h * G : (i_h + 1) * G, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) @@ -530,21 +531,15 @@ def tilelang_sparse_attention( i_s = BlockIndices[i_b, i_t, i_h, i] * BS if i_s <= i_t and i_s >= 0: # [BS, BK] - T.copy(K[i_b, i_s:i_s + BS, i_h, :], K_shared) + T.copy(K[i_b, i_s : i_s + BS, i_h, :], K_shared) if is_causal: for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) # Softmax T.copy(scores_max, scores_max_prev) @@ -564,45 +559,33 @@ def tilelang_sparse_attention( acc_o[i, j] *= scores_scale[i] # V * softmax(Q * K) - T.copy(V[i_b, i_s:i_s + BS, i_h, i_v * BV:(i_v + 1) * BV], V_shared) + T.copy(V[i_b, i_s : i_s + BS, i_h, i_v * BV : (i_v + 1) * BV], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(G, BV): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[i_b, i_t, i_h * G:(i_h + 1) * G, i_v * BV:(i_v + 1) * BV]) + T.copy(O_shared, Output[i_b, i_t, i_h * G : (i_h + 1) * G, i_v * BV : (i_v + 1) * BV]) return tilelang_sparse_attention def generate_block_indices(batch, seq_len, heads, selected_blocks, block_size): """Generate random block indices for the benchmark.""" - block_indices = torch.full((batch, seq_len, heads, selected_blocks), - seq_len, - dtype=torch.long, - device='cuda') + block_indices = torch.full((batch, seq_len, heads, selected_blocks), seq_len, dtype=torch.long, device="cuda") for b in range(batch): for t in range(seq_len): for h in range(heads): i_i = torch.randperm(max(1, (t // block_size)))[:selected_blocks] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i return block_indices.sort(-1)[0] -def benchmark_nsa(batch_size, - seq_len, - heads, - head_query, - dim, - selected_blocks, - block_size, - dtype, - scale, - warmup=10, - iterations=100, - validate=False): +def benchmark_nsa( + batch_size, seq_len, heads, head_query, dim, selected_blocks, block_size, dtype, scale, warmup=10, iterations=100, validate=False +): """Benchmark the TileLang Sparse Attention implementation.""" # Set random seed for reproducibility @@ -628,14 +611,13 @@ def benchmark_nsa(batch_size, print(f"Profiler latency: {profiler_latency} ms") # Create input tensors - Q = torch.randn((batch_size, seq_len, head_query, dim), dtype=dtype, device='cuda') - K = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device='cuda') - V = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device='cuda') - out = torch.empty((batch_size, seq_len, head_query, dim), dtype=dtype, device='cuda') + Q = torch.randn((batch_size, seq_len, head_query, dim), dtype=dtype, device="cuda") + K = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device="cuda") + 
V = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device="cuda") + out = torch.empty((batch_size, seq_len, head_query, dim), dtype=dtype, device="cuda") # Generate block indices - block_indices = generate_block_indices(batch_size, seq_len, heads, selected_blocks, - block_size).to(torch.int32) + block_indices = generate_block_indices(batch_size, seq_len, heads, selected_blocks, block_size).to(torch.int32) # Warmup for _ in range(warmup): @@ -666,10 +648,9 @@ def benchmark_nsa(batch_size, # Validate result against reference if requested if validate: - g_slc = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device='cuda') - g_swa = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device='cuda') - block_counts = torch.randint( - 1, selected_blocks + 1, (batch_size, seq_len, heads), device='cuda') + g_slc = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device="cuda") + g_swa = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device="cuda") + block_counts = torch.randint(1, selected_blocks + 1, (batch_size, seq_len, heads), device="cuda") ref = naive_nsa( q=Q, @@ -700,22 +681,13 @@ def benchmark_nsa(batch_size, "head_query": head_query, "dim": dim, "selected_blocks": selected_blocks, - "block_size": block_size + "block_size": block_size, } -def benchmark_triton_nsa(batch_size, - seq_len, - heads, - head_query, - dim, - selected_blocks, - block_size, - dtype, - scale, - warmup=10, - iterations=100, - validate=False): +def benchmark_triton_nsa( + batch_size, seq_len, heads, head_query, dim, selected_blocks, block_size, dtype, scale, warmup=10, iterations=100, validate=False +): """Benchmark the Triton-based TileLang Sparse Attention implementation.""" # Set random seed for reproducibility @@ -723,18 +695,17 @@ def benchmark_triton_nsa(batch_size, torch.random.manual_seed(0) # Create input tensors - Q = torch.randn((batch_size, seq_len, head_query, dim), dtype=dtype, device='cuda') - K = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device='cuda') - V = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device='cuda') - g_slc = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device='cuda') - g_swa = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device='cuda') + Q = torch.randn((batch_size, seq_len, head_query, dim), dtype=dtype, device="cuda") + K = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device="cuda") + V = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device="cuda") + g_slc = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device="cuda") + g_swa = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device="cuda") # Generate block indices block_indices = generate_block_indices(batch_size, seq_len, heads, selected_blocks, block_size) - block_counts = torch.randint( - 1, selected_blocks + 1, (batch_size, seq_len, heads), device='cuda') - o_slc = torch.empty((batch_size, seq_len, head_query, dim), dtype=dtype, device='cuda') - lse_slc = torch.empty((batch_size, seq_len, head_query), dtype=torch.float, device='cuda') + block_counts = torch.randint(1, selected_blocks + 1, (batch_size, seq_len, heads), device="cuda") + o_slc = torch.empty((batch_size, seq_len, head_query, dim), dtype=dtype, device="cuda") + lse_slc = torch.empty((batch_size, seq_len, head_query), dtype=torch.float, device="cuda") # Warmup for _ in range(warmup): @@ -750,7 +721,8 @@ def benchmark_triton_nsa(batch_size, block_counts=block_counts, block_size=block_size, window_size=0, 
- scale=scale) + scale=scale, + ) # Synchronize before timing torch.cuda.synchronize() @@ -770,7 +742,8 @@ def benchmark_triton_nsa(batch_size, block_counts=block_counts, block_size=block_size, window_size=0, - scale=scale) + scale=scale, + ) torch.cuda.synchronize() end_time = time.time() @@ -815,54 +788,28 @@ def benchmark_triton_nsa(batch_size, "head_query": head_query, "dim": dim, "selected_blocks": selected_blocks, - "block_size": block_size + "block_size": block_size, } -def run_benchmark_suite(impl='all'): +def run_benchmark_suite(impl="all"): """Run a suite of benchmarks with different configurations.""" # Define configurations to benchmark configs = [ # Small model config - Note: head_query must be a multiple of heads*16 for Triton - { - "batch_size": 2, - "seq_len": 1024, - "heads": 8, - "head_query": 8 * 16, - "dim": 64, - "selected_blocks": 8, - "block_size": 32 - }, - + {"batch_size": 2, "seq_len": 1024, "heads": 8, "head_query": 8 * 16, "dim": 64, "selected_blocks": 8, "block_size": 32}, # Medium model config - { - "batch_size": 2, - "seq_len": 2048, - "heads": 16, - "head_query": 16 * 16, - "dim": 64, - "selected_blocks": 16, - "block_size": 64 - }, - + {"batch_size": 2, "seq_len": 2048, "heads": 16, "head_query": 16 * 16, "dim": 64, "selected_blocks": 16, "block_size": 64}, # Large model config - { - "batch_size": 1, - "seq_len": 4096, - "heads": 32, - "head_query": 32 * 16, - "dim": 128, - "selected_blocks": 32, - "block_size": 128 - }, + {"batch_size": 1, "seq_len": 4096, "heads": 32, "head_query": 32 * 16, "dim": 128, "selected_blocks": 32, "block_size": 128}, ] results = [] for config in configs: print(f"Running benchmark with config: {config}") - if impl in ['all', 'tilelang']: + if impl in ["all", "tilelang"]: print("Benchmarking TileLang implementation:") result = benchmark_nsa( batch_size=config["batch_size"], @@ -874,12 +821,13 @@ def run_benchmark_suite(impl='all'): block_size=config["block_size"], dtype=torch.float16, scale=0.1, - validate=False) + validate=False, + ) results.append({"impl": "tilelang", **result}) print(f"Average time: {result['avg_time_ms']:.2f} ms") print(f"Performance: {result['tflops']:.2f} TFLOPs") - if impl in ['all', 'triton']: + if impl in ["all", "triton"]: print("Benchmarking Triton implementation:") result = benchmark_triton_nsa( batch_size=config["batch_size"], @@ -891,19 +839,24 @@ def run_benchmark_suite(impl='all'): block_size=config["block_size"], dtype=torch.float16, scale=0.1, - validate=False) + validate=False, + ) results.append({"impl": "triton", **result}) print(f"Average time: {result['avg_time_ms']:.2f} ms") print(f"Performance: {result['tflops']:.2f} TFLOPs") - if impl in ['all']: + if impl in ["all"]: # Print comparison if both implementations were run tilelang_result = next( - r for r in results if r["impl"] == "tilelang" and - r["batch_size"] == config["batch_size"] and r["seq_len"] == config["seq_len"]) + r + for r in results + if r["impl"] == "tilelang" and r["batch_size"] == config["batch_size"] and r["seq_len"] == config["seq_len"] + ) triton_result = next( - r for r in results if r["impl"] == "triton" and - r["batch_size"] == config["batch_size"] and r["seq_len"] == config["seq_len"]) + r + for r in results + if r["impl"] == "triton" and r["batch_size"] == config["batch_size"] and r["seq_len"] == config["seq_len"] + ) speedup = tilelang_result["avg_time_ms"] / triton_result["avg_time_ms"] print(f"Speedup (Triton vs TileLang): {speedup:.2f}x") @@ -921,8 +874,7 @@ def run_benchmark_suite(impl='all'): 
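# `generate_block_indices(...)` used by the benchmarks above is defined earlier in this
# file. For reference, a minimal sketch of an equivalent generator, mirroring the
# per-token loops in the example scripts changed later in this patch (the name
# `_generate_block_indices_sketch` and the `seq_len` padding sentinel are illustrative;
# the real helper may differ; assumes `torch` is imported and a CUDA device is available):
def _generate_block_indices_sketch(batch_size, seq_len, heads, selected_blocks, block_size):
    # Start from `seq_len` as an "invalid block" sentinel; the kernels skip blocks that
    # do not start at or before the query position.
    idx = torch.full((batch_size, seq_len, heads, selected_blocks), seq_len,
                     dtype=torch.long, device="cuda")
    for b in range(batch_size):
        for t in range(seq_len):
            for h in range(heads):
                # Only key blocks starting at or before token t are selectable (causal).
                i_i = torch.randperm(max(1, t // block_size))[:selected_blocks]
                idx[b, t, h, :len(i_i)] = i_i
    return idx.sort(-1)[0]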
parser.add_argument("--dim", type=int, default=128, help="Head dimension") parser.add_argument("--selected_blocks", type=int, default=16, help="Number of selected blocks") parser.add_argument("--block_size", type=int, default=32, help="Block size") - parser.add_argument( - "--dtype", type=str, default="float16", help="Data type (float16 or float32)") + parser.add_argument("--dtype", type=str, default=T.float16, help="Data type (float16 or float32)") parser.add_argument("--scale", type=float, default=0.1, help="Attention scale factor") parser.add_argument("--iterations", type=int, default=100, help="Number of iterations") parser.add_argument("--warmup", type=int, default=10, help="Warmup iterations") @@ -933,7 +885,8 @@ def run_benchmark_suite(impl='all'): type=str, default="all", choices=["tilelang", "triton", "all"], - help="Implementation to benchmark (tilelang, triton, or all)") + help="Implementation to benchmark (tilelang, triton, or all)", + ) args = parser.parse_args() @@ -941,13 +894,12 @@ def run_benchmark_suite(impl='all'): if args.impl in ["triton", "all"] and args.head_query % (args.heads * 16) != 0: # Adjust head_query to nearest valid value args.head_query = ((args.head_query // (args.heads * 16)) + 1) * (args.heads * 16) - print( - f"Adjusted head_query to {args.head_query} to be compatible with Triton implementation") + print(f"Adjusted head_query to {args.head_query} to be compatible with Triton implementation") if args.suite: run_benchmark_suite(impl=args.impl) else: - dtype = torch.float16 if args.dtype == "float16" else torch.float32 + dtype = torch.float16 if args.dtype == T.float16 else torch.float32 if args.impl in ["tilelang", "all"]: print("Benchmarking TileLang implementation:") @@ -963,12 +915,14 @@ def run_benchmark_suite(impl='all'): scale=args.scale, warmup=args.warmup, iterations=args.iterations, - validate=args.validate) + validate=args.validate, + ) print("\nBenchmark Results (TileLang):") print( - f"Configuration: batch={args.batch}, seq_len={args.seq_len}, heads={args.heads}, " + - f"head_query={args.head_query}, dim={args.dim}, blocks={args.selected_blocks}, " + - f"block_size={args.block_size}") + f"Configuration: batch={args.batch}, seq_len={args.seq_len}, heads={args.heads}, " + + f"head_query={args.head_query}, dim={args.dim}, blocks={args.selected_blocks}, " + + f"block_size={args.block_size}" + ) print(f"Average time: {result['avg_time_ms']:.2f} ms") print(f"Performance: {result['tflops']:.2f} TFLOPs") @@ -986,11 +940,13 @@ def run_benchmark_suite(impl='all'): scale=args.scale, warmup=args.warmup, iterations=args.iterations, - validate=args.validate) + validate=args.validate, + ) print("\nBenchmark Results (Triton):") print( - f"Configuration: batch={args.batch}, seq_len={args.seq_len}, heads={args.heads}, " + - f"head_query={args.head_query}, dim={args.dim}, blocks={args.selected_blocks}, " + - f"block_size={args.block_size}") + f"Configuration: batch={args.batch}, seq_len={args.seq_len}, heads={args.heads}, " + + f"head_query={args.head_query}, dim={args.dim}, blocks={args.selected_blocks}, " + + f"block_size={args.block_size}" + ) print(f"Average time: {result['avg_time_ms']:.2f} ms") print(f"Performance: {result['tflops']:.2f} TFLOPs") diff --git a/examples/deepseek_nsa/example_tilelang_nsa_bwd.py b/examples/deepseek_nsa/example_tilelang_nsa_bwd.py index 8387d2271..3da285a9b 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_bwd.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_bwd.py @@ -7,6 +7,7 @@ import triton import fla + if 
parse(fla.__version__) < parse("0.2.1"): from fla.ops.common.utils import prepare_token_indices else: @@ -22,7 +23,8 @@ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + } +) def tilelang_kernel_fwd( batch, heads, @@ -34,11 +36,10 @@ def tilelang_kernel_fwd( groups=1, selected_blocks=16, ): - from tilelang import language as T if scale is None: - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) else: scale = scale * 1.44269504 # log2(e) @@ -48,9 +49,9 @@ def tilelang_kernel_fwd( o_slc_shape = [batch, seq_len, heads, dim] lse_slc_shape = [batch, seq_len, heads] block_indices_shape = [batch, seq_len, head_kv, selected_blocks] - block_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" + block_indices_dtype = T.int32 + dtype = T.float16 + accum_dtype = T.float32 block_S = block_size block_T = min(128, tilelang.math.next_power_of_2(dim)) @@ -67,12 +68,12 @@ def tilelang_kernel_fwd( @T.prim_func def native_sparse_attention( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), - O_slc: T.Tensor(o_slc_shape, dtype), - LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), + O_slc: T.Tensor(o_slc_shape, dtype), + LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), ): with T.Kernel(seq_len, NV, batch * head_kv, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([G, BK], dtype) @@ -93,7 +94,7 @@ def native_sparse_attention( i_b, i_h = i_bh // head_kv, i_bh % head_kv NS = S - T.copy(Q[i_b, i_t, i_h * G:(i_h + 1) * G, :], Q_shared) + T.copy(Q[i_b, i_t, i_h * G : (i_h + 1) * G, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) @@ -103,12 +104,11 @@ def native_sparse_attention( i_s = BlockIndices[i_b, i_t, i_h, i] * BS if i_s <= i_t and i_s >= 0: # [BS, BK] - T.copy(K[i_b, i_s:i_s + BS, i_h, :], K_shared) + T.copy(K[i_b, i_s : i_s + BS, i_h, :], K_shared) if is_causal: - for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, - -T.infinity(acc_s.dtype)) + for k, j in T.Parallel(G, BS): + acc_s[k, j] = T.if_then_else(i_t >= (i_s + j), 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) @@ -124,21 +124,21 @@ def native_sparse_attention( T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=True) - for i in T.Parallel(G): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + for k in T.Parallel(G): + scores_scale[k] = T.exp2(scores_max_prev[k] * scale - scores_max[k] * scale) + for k, j in T.Parallel(G, BS): + acc_s[k, j] = T.exp2(acc_s[k, j] * scale - scores_max[k] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(G): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + for k in T.Parallel(G): + logsum[k] = logsum[k] * scores_scale[k] + scores_sum[k] T.copy(acc_s, acc_s_cast) # Rescale - for i, j in T.Parallel(G, BV): - acc_o[i, j] *= scores_scale[i] + for k, j in T.Parallel(G, BV): + acc_o[k, j] *= scores_scale[k] # V * softmax(Q * K) - T.copy(V[i_b, i_s:i_s + BS, i_h, i_v * BV:(i_v + 1) * BV], V_shared) + 
T.copy(V[i_b, i_s : i_s + BS, i_h, i_v * BV : (i_v + 1) * BV], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(G, BV): @@ -146,18 +146,20 @@ def native_sparse_attention( T.copy(acc_o, O_shared) T.copy( O_shared, - O_slc[i_b, i_t, i_h * G:(i_h + 1) * G, i_v * BV:(i_v + 1) * BV], + O_slc[i_b, i_t, i_h * G : (i_h + 1) * G, i_v * BV : (i_v + 1) * BV], ) for i in T.Parallel(G): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, LSE_slc[i_b, i_t, i_h * G:(i_h + 1) * G]) + T.copy(logsum, LSE_slc[i_b, i_t, i_h * G : (i_h + 1) * G]) return native_sparse_attention -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def tilelang_kernel_bwd_dkv( batch, heads, @@ -168,11 +170,11 @@ def tilelang_kernel_bwd_dkv( block_size=64, groups=1, selected_blocks=16, - dtype="float16", - accum_dtype="float", + dtype=T.float16, + accum_dtype=T.float32, ): if scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 else: sm_scale = scale @@ -207,15 +209,15 @@ def tilelang_kernel_bwd_dkv( @T.prim_func def flash_bwd_dkv( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(k_shape, dtype), - V: T.Tensor(v_shape, dtype), - LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), - Delta_slc: T.Tensor(delta_slc_shape, accum_dtype), - DO_slc: T.Tensor(do_slc_shape, dtype), - DK: T.Tensor(dk_shape, dtype), - DV: T.Tensor(dv_shape, dtype), - BlockMask: T.Tensor(block_mask_shape, "int32"), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(k_shape, dtype), + V: T.Tensor(v_shape, dtype), + LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), + Delta_slc: T.Tensor(delta_slc_shape, accum_dtype), + DO_slc: T.Tensor(do_slc_shape, dtype), + DK: T.Tensor(dk_shape, dtype), + DV: T.Tensor(dv_shape, dtype), + BlockMask: T.Tensor(block_mask_shape, T.int32), ): with T.Kernel(NV, NS, B * H, threads=num_threads) as (i_v, i_s, i_bh): K_shared = T.alloc_shared([BS, BK], dtype) @@ -238,31 +240,25 @@ def flash_bwd_dkv( i_b, i_h = i_bh // H, i_bh % H - T.copy(K[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BK], K_shared) - T.copy(V[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BV], V_shared) + T.copy(K[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BK], K_shared) + T.copy(V[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BV], V_shared) # [BS, BK] T.clear(dk) # [BS, BV] T.clear(dv) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) - loop_st = i_s * BS loop_ed = seq_len for i in T.Pipelined( - start=loop_st, - stop=loop_ed, - num_stages=0, + start=loop_st, + stop=loop_ed, + num_stages=0, ): b_m_slc = BlockMask[i_b, i, i_h, i_s] if b_m_slc != 0: # [G, BK] - T.copy(Q[i_b, i, i_h * G:(i_h + 1) * G, :BK], Q_shared) + T.copy(Q[i_b, i, i_h * G : (i_h + 1) * G, :BK], Q_shared) T.clear(qkT) # [BS, BK] @ [G, BK] -> [BS, G] T.gemm( @@ -273,7 +269,7 @@ def flash_bwd_dkv( policy=T.GemmWarpPolicy.FullRow, ) # [G] - T.copy(LSE_slc[i_b, i, i_h * G:(i_h + 1) * G], lse_shared) + T.copy(LSE_slc[i_b, i, i_h * G : (i_h + 1) * G], lse_shared) for _i, _j in T.Parallel(BS, G): qkT[_i, _j] = T.exp2(qkT[_i, _j] * scale - lse_shared[_j]) @@ -282,7 +278,7 @@ def flash_bwd_dkv( qkT[_i, _j] = T.if_then_else(i >= (i_s * BS + _i), qkT[_i, _j], 0) # [G, BV] - T.copy(DO_slc[i_b, i, i_h * G:(i_h + 1) * G, :BV], do) + T.copy(DO_slc[i_b, i, i_h * G : (i_h + 1) * G, :BV], do) 
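# The rest of this loop body is the FlashAttention-style backward for one selected
# key/value block; in the notation of the forward kernel above (a reading of the code,
# with the right-hand sides kept transposed as [BS, G]):
#   P  = exp2(q @ k^T * scale - lse)     -> `qkT`
#   dP = dO @ v^T                        -> `dsT`
#   dS = P * (dP - delta) * sm_scale     -> `dsT_cast`, where `delta` = rowsum(O * dO)
#                                           comes from the preprocess kernel below
#   dv += P^T @ dO    and    dk += dS^T @ q
# `scale` already folds in log2(e), which is why exp2/log2 are used instead of exp/log.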
T.clear(dsT) # [BS, BV] @ [G, BV] -> [BS, G] T.gemm( @@ -296,7 +292,7 @@ def flash_bwd_dkv( # [BS, G] @ [G, BV] -> [BS, BV] T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) # [G] - T.copy(Delta_slc[i_b, i, i_h * G:(i_h + 1) * G], delta) + T.copy(Delta_slc[i_b, i, i_h * G : (i_h + 1) * G], delta) for i, j in T.Parallel(BS, G): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -305,8 +301,8 @@ def flash_bwd_dkv( T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, DV[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BV]) - T.copy(dk_shared, DK[i_v, i_b, i_s * BS:(i_s + 1) * BS, i_h, :BK]) + T.copy(dv_shared, DV[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BV]) + T.copy(dk_shared, DK[i_v, i_b, i_s * BS : (i_s + 1) * BS, i_h, :BK]) return flash_bwd_dkv @@ -321,9 +317,11 @@ def make_dq_layout(dQ): ) -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def tilelang_kernel_bwd_dqkv( batch, heads, @@ -334,11 +332,11 @@ def tilelang_kernel_bwd_dqkv( block_size=64, groups=1, selected_blocks=16, - dtype="float16", - accum_dtype="float", + dtype=T.float16, + accum_dtype=T.float32, ): if scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 else: sm_scale = scale @@ -373,16 +371,16 @@ def tilelang_kernel_bwd_dqkv( @T.prim_func def flash_bwd_dqkv( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(k_shape, dtype), - V: T.Tensor(v_shape, dtype), - LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), - Delta_slc: T.Tensor(delta_slc_shape, accum_dtype), - DO_slc: T.Tensor(do_slc_shape, dtype), - DQ: T.Tensor(dq_shape, dtype), - DK: T.Tensor(dk_shape, dtype), - DV: T.Tensor(dv_shape, dtype), - BlockMask: T.Tensor(block_mask_shape, "int32"), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(k_shape, dtype), + V: T.Tensor(v_shape, dtype), + LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), + Delta_slc: T.Tensor(delta_slc_shape, accum_dtype), + DO_slc: T.Tensor(do_slc_shape, dtype), + DQ: T.Tensor(dq_shape, dtype), + DK: T.Tensor(dk_shape, dtype), + DV: T.Tensor(dv_shape, dtype), + BlockMask: T.Tensor(block_mask_shape, T.int32), ): with T.Kernel(NV, NS, B * H, threads=num_threads) as (i_v, i_s, i_bh): K_shared = T.alloc_shared([BS, BK], dtype) @@ -406,31 +404,25 @@ def flash_bwd_dqkv( i_b, i_h = i_bh // H, i_bh % H - T.copy(K[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BK], K_shared) - T.copy(V[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BV], V_shared) + T.copy(K[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BK], K_shared) + T.copy(V[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BV], V_shared) # [BS, BK] T.clear(dk) # [BS, BV] T.clear(dv) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) - loop_st = i_s * BS loop_ed = seq_len for i in T.Pipelined( - start=loop_st, - stop=loop_ed, - num_stages=0, + start=loop_st, + stop=loop_ed, + num_stages=0, ): b_m_slc = BlockMask[i_b, i, i_h, i_s] if b_m_slc != 0: # [G, BK] - T.copy(Q[i_b, i, i_h * G:(i_h + 1) * G, :BK], Q_shared) + T.copy(Q[i_b, i, i_h * G : (i_h + 1) * G, :BK], Q_shared) T.clear(qkT) # [BS, BK] @ [G, BK] -> [BS, G] T.gemm( @@ -441,7 +433,7 @@ def flash_bwd_dqkv( policy=T.GemmWarpPolicy.FullRow, ) # [G] - T.copy(LSE_slc[i_b, i, i_h * G:(i_h + 1) * G], lse_shared) + T.copy(LSE_slc[i_b, i, i_h * G : (i_h + 1) * G], lse_shared) for _i, _j in T.Parallel(BS, G): qkT[_i, _j] = T.exp2(qkT[_i, _j] 
* scale - lse_shared[_j]) @@ -450,7 +442,7 @@ def flash_bwd_dqkv( qkT[_i, _j] = T.if_then_else(i >= (i_s * BS + _i), qkT[_i, _j], 0) # [G, BV] - T.copy(DO_slc[i_b, i, i_h * G:(i_h + 1) * G, :BV], do) + T.copy(DO_slc[i_b, i, i_h * G : (i_h + 1) * G, :BV], do) T.clear(dsT) # [BS, BV] @ [G, BV] -> [BS, G] T.gemm( @@ -464,9 +456,9 @@ def flash_bwd_dqkv( # [BS, G] @ [G, BV] -> [BS, BV] T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) # [G] - T.copy(Delta_slc[i_b, i, i_h * G:(i_h + 1) * G], delta) - for i, j in T.Parallel(BS, G): - dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale + T.copy(Delta_slc[i_b, i, i_h * G : (i_h + 1) * G], delta) + for _i, _j in T.Parallel(BS, G): + dsT_cast[_i, _j] = qkT[_i, _j] * (dsT[_i, _j] - delta[_j]) * sm_scale # [BS, G] @ [G, BK] -> [BS, BK] T.gemm(dsT_cast, Q_shared, dk, policy=T.GemmWarpPolicy.FullRow) @@ -480,23 +472,25 @@ def flash_bwd_dqkv( T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, DV[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BV]) - T.copy(dk_shared, DK[i_v, i_b, i_s * BS:(i_s + 1) * BS, i_h, :BK]) + T.copy(dv_shared, DV[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BV]) + T.copy(dk_shared, DK[i_v, i_b, i_s * BS : (i_s + 1) * BS, i_h, :BK]) return flash_bwd_dqkv @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def tilelang_kernel_preprocess( batch, heads, seq_len, dim, - dtype="float16", - accum_dtype="float", + dtype=T.float16, + accum_dtype=T.float32, blk=32, ): from tilelang import language as T @@ -505,9 +499,9 @@ def tilelang_kernel_preprocess( @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, seq_len, heads], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, seq_len, heads], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -516,27 +510,29 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, by * blk:(by + 1) * blk, bx]) + T.copy(delta, Delta[bz, by * blk : (by + 1) * blk, bx]) return flash_bwd_prep @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def tilelang_kernel_block_mask( batch, heads, seq_len, selected_blocks, block_size, - dtype="int32", + dtype=T.int32, ): from tilelang import language as T @@ -551,9 +547,9 @@ def tilelang_kernel_block_mask( @T.prim_func def flash_bwd_block_mask( - BlockIndices: T.Tensor(block_indices_shape, dtype), # type: ignore - BlockCounts: T.Tensor(block_counts_shape, dtype), # type: ignore - BlockMask: T.Tensor(block_mask_shape, dtype), # type: ignore + BlockIndices: T.Tensor(block_indices_shape, dtype), # type: ignore + BlockCounts: T.Tensor(block_counts_shape, dtype), # type: ignore + BlockMask: T.Tensor(block_mask_shape, dtype), # type: ignore ): with 
T.Kernel(seq_len, batch, heads * S) as (bx, by, bz): i_t, i_b, i_hs = bx, by, bz @@ -603,9 +599,7 @@ def parallel_nsa_bwd( dk = torch.empty(NV, *k.shape, dtype=k.dtype, device=q.device) dv = torch.empty(v.shape, dtype=v.dtype, device=q.device) - block_mask = tilelang_kernel_block_mask(B, H, T, S, - BS)(block_indices.to(torch.int32), - block_counts.to(torch.int32)).to(torch.bool) + block_mask = tilelang_kernel_block_mask(B, H, T, S, BS)(block_indices.to(torch.int32), block_counts.to(torch.int32)).to(torch.bool) fused_qkv_bwd_kernel = tilelang_kernel_bwd_dqkv( batch=B, @@ -618,8 +612,7 @@ def parallel_nsa_bwd( selected_blocks=S, scale=scale, ) - fused_qkv_bwd_kernel(q, k, v, lse_slc, delta_slc, do_slc, dq, dk, dv, - block_mask.to(torch.int32)) + fused_qkv_bwd_kernel(q, k, v, lse_slc, delta_slc, do_slc, dq, dk, dv, block_mask.to(torch.int32)) dq = dq.sum(0) dk = dk.sum(0) @@ -628,7 +621,6 @@ def parallel_nsa_bwd( @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -773,23 +765,21 @@ def parallel_nsa( Outputs of shape `[B, SEQLEN, HQ, V]` if `head_first=False` else `[B, HQ, SEQLEN, V]`. """ if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), - (q, k, v, block_indices)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): block_counts = rearrange(block_counts, "b h t -> b t h") - assert (q.shape[2] % (k.shape[2] * 16) == 0), "Group size must be a multiple of 16 in NSA" + assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, - window_size, scale, cu_seqlens) + o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: o = torch.addcmul(o_slc * g_slc.unsqueeze(-1), o_swa, g_swa.unsqueeze(-1)) else: @@ -814,7 +804,7 @@ def parallel_nsa( for t in range(T): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] block_counts = torch.randint(1, S + 1, (B, T, H), device="cuda") diff --git a/examples/deepseek_nsa/example_tilelang_nsa_decode.py b/examples/deepseek_nsa/example_tilelang_nsa_decode.py index 58f435509..381d92493 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_decode.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_decode.py @@ -16,7 +16,8 @@ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def native_sparse_attention( batch, heads, @@ -25,18 +26,18 @@ def native_sparse_attention( scale=None, block_size=64, # Tile size for attention computation groups=1, # Grouped query attention (GQA) groups - selected_blocks=16 # Number of blocks to select per attention head + selected_blocks=16, # Number of blocks to select per attention head ): if scale is None: - scale = 
(1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups # Modified shapes for inference (q has seq_len=1)a q_shape = [batch, 1, heads, dim] # Changed seq_len to 1 kv_shape = [batch, seq_len, head_kv, dim] block_indices_shape = [batch, 1, head_kv, selected_blocks] # Changed seq_len to 1 - block_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" + block_indices_dtype = T.int32 + dtype = T.float16 + accum_dtype = T.float32 block_S = block_size block_T = min(128, tilelang.math.next_power_of_2(dim)) @@ -53,12 +54,11 @@ def native_sparse_attention( @T.prim_func def native_sparse_attention( - Q: T.Tensor(q_shape, dtype), # [batch, 1, heads, dim] - K: T.Tensor(kv_shape, dtype), # [batch, seq_len, head_kv, dim] - V: T.Tensor(kv_shape, dtype), # Same shape as K - BlockIndices: T.Tensor(block_indices_shape, - block_indices_dtype), # Selected block indices - Output: T.Tensor(q_shape, dtype), # Output attention tensor + Q: T.Tensor(q_shape, dtype), # [batch, 1, heads, dim] + K: T.Tensor(kv_shape, dtype), # [batch, seq_len, head_kv, dim] + V: T.Tensor(kv_shape, dtype), # Same shape as K + BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), # Selected block indices + Output: T.Tensor(q_shape, dtype), # Output attention tensor ): with T.Kernel(1, NV, batch * head_kv, threads=threads) as (bx, by, bz): # Shared memory allocations for tile storage @@ -82,7 +82,7 @@ def native_sparse_attention( NS = S # Copy Q for the single position - T.copy(Q[i_b, 0, i_h * G:(i_h + 1) * G, :], Q_shared) # Changed i_t to 0 + T.copy(Q[i_b, 0, i_h * G : (i_h + 1) * G, :], Q_shared) # Changed i_t to 0 T.fill(acc_o, 0) T.fill(logsum, 0) @@ -93,16 +93,11 @@ def native_sparse_attention( i_s = BlockIndices[i_b, 0, i_h, i] * BS # Get block offset if i_s >= 0: # Skip invalid/padding blocks # Load current key block to shared memory - T.copy(K[i_b, i_s:i_s + BS, i_h, :], K_shared) + T.copy(K[i_b, i_s : i_s + BS, i_h, :], K_shared) # Compute QK^T attention scores T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) # Online softmax with numerical stability # 1. 
Compute max for scaling @@ -122,15 +117,14 @@ def native_sparse_attention( T.copy(acc_s, acc_s_cast) # Accumulate attention-weighted values - T.copy(V[i_b, i_s:i_s + BS, i_h, i_v * BV:(i_v + 1) * BV], V_shared) + T.copy(V[i_b, i_s : i_s + BS, i_h, i_v * BV : (i_v + 1) * BV], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) # Final normalization and output for i, j in T.Parallel(G, BV): acc_o[i, j] /= logsum[i] # Normalize by logsum T.copy(acc_o, O_shared) - T.copy(O_shared, Output[i_b, 0, i_h * G:(i_h + 1) * G, - i_v * BV:(i_v + 1) * BV]) # Changed i_t to 0 + T.copy(O_shared, Output[i_b, 0, i_h * G : (i_h + 1) * G, i_v * BV : (i_v + 1) * BV]) # Changed i_t to 0 return native_sparse_attention @@ -149,21 +143,21 @@ def main(): selected_blocks=S, ) - Q = torch.randn((B, SEQ_LEN_Q, HQ, D), dtype=dtype, device='cuda').requires_grad_(True) - K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device='cuda').requires_grad_(True) - V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device='cuda').requires_grad_(True) + Q = torch.randn((B, SEQ_LEN_Q, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) - mask = torch.randint(0, 2, (B, SEQ_LEN, groups), device='cuda') - DO = torch.randn((B, SEQ_LEN_Q, HQ, D), dtype=dtype, device='cuda') + mask = torch.randint(0, 2, (B, SEQ_LEN, groups), device="cuda") + DO = torch.randn((B, SEQ_LEN_Q, HQ, D), dtype=dtype, device="cuda") - block_indices = torch.full((B, SEQ_LEN_Q, H, S), SEQ_LEN, dtype=torch.long, device='cuda') + block_indices = torch.full((B, SEQ_LEN_Q, H, S), SEQ_LEN, dtype=torch.long, device="cuda") for b in range(B): for t in range(SEQ_LEN_Q): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (B, SEQ_LEN_Q, H), device='cuda') + block_counts = torch.randint(1, S + 1, (B, SEQ_LEN_Q, H), device="cuda") out = kernel(Q, K, V, block_indices.to(torch.int32)) @@ -178,5 +172,38 @@ def main(): torch.testing.assert_close(ref, out, atol=1e-2, rtol=1e-2) +def run_regression_perf(): + B, SEQ_LEN, H, HQ, D, S, block_size, dtype = 2, 64, 1, 16, 16, 1, 32, torch.float16 + groups = HQ // H + SEQ_LEN_Q = 1 + kernel = native_sparse_attention( + batch=B, + heads=HQ, + seq_len=SEQ_LEN, + dim=D, + block_size=block_size, + groups=HQ // H, + selected_blocks=S, + ) + + Q = torch.randn((B, SEQ_LEN_Q, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + block_indices = torch.full((B, SEQ_LEN_Q, H, S), SEQ_LEN, dtype=torch.long, device="cuda") + for b in range(B): + for t in range(SEQ_LEN_Q): + for h in range(H): + i_i = torch.randperm(max(1, (t // block_size)))[:S] + block_indices[b, t, h, : len(i_i)] = i_i + block_indices = block_indices.sort(-1)[0] + + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(Q, K, V, block_indices.to(torch.int32)) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/deepseek_nsa/example_tilelang_nsa_fwd.py b/examples/deepseek_nsa/example_tilelang_nsa_fwd.py index f8a7ebfb0..7b36d6e26 100644 --- 
a/examples/deepseek_nsa/example_tilelang_nsa_fwd.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_fwd.py @@ -14,18 +14,11 @@ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) -def native_sparse_attention(batch, - heads, - seq_len, - dim, - is_causal, - scale=None, - block_size=64, - groups=1, - selected_blocks=16): + }, +) +def native_sparse_attention(batch, heads, seq_len, dim, is_causal, scale=None, block_size=64, groups=1, selected_blocks=16): if scale is None: - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) else: scale = scale * 1.44269504 # log2(e) @@ -33,9 +26,9 @@ def native_sparse_attention(batch, q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] block_indices_shape = [batch, seq_len, head_kv, selected_blocks] - block_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" + block_indices_dtype = T.int32 + dtype = T.float16 + accum_dtype = T.float32 block_S = block_size block_T = min(128, tilelang.math.next_power_of_2(dim)) @@ -52,11 +45,11 @@ def native_sparse_attention(batch, @T.prim_func def native_sparse_attention( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(seq_len, NV, batch * head_kv, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([G, BK], dtype) @@ -77,7 +70,7 @@ def native_sparse_attention( i_b, i_h = i_bh // head_kv, i_bh % head_kv NS = S - T.copy(Q[i_b, i_t, i_h * G:(i_h + 1) * G, :], Q_shared) + T.copy(Q[i_b, i_t, i_h * G : (i_h + 1) * G, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) @@ -87,21 +80,15 @@ def native_sparse_attention( i_s = BlockIndices[i_b, i_t, i_h, i] * BS if i_s <= i_t and i_s >= 0: # [BS, BK] - T.copy(K[i_b, i_s:i_s + BS, i_h, :], K_shared) + T.copy(K[i_b, i_s : i_s + BS, i_h, :], K_shared) if is_causal: for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) # Softmax T.copy(scores_max, scores_max_prev) @@ -121,13 +108,13 @@ def native_sparse_attention( acc_o[i, j] *= scores_scale[i] # V * softmax(Q * K) - T.copy(V[i_b, i_s:i_s + BS, i_h, i_v * BV:(i_v + 1) * BV], V_shared) + T.copy(V[i_b, i_s : i_s + BS, i_h, i_v * BV : (i_v + 1) * BV], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(G, BV): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[i_b, i_t, i_h * G:(i_h + 1) * G, i_v * BV:(i_v + 1) * BV]) + T.copy(O_shared, Output[i_b, i_t, i_h * G : (i_h + 1) * G, i_v * BV : (i_v + 1) * BV]) return native_sparse_attention @@ -148,21 +135,22 @@ def main(): ) print(kernel.get_kernel_source()) torch.random.manual_seed(0) - Q = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device='cuda').requires_grad_(True) - K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, 
device='cuda').requires_grad_(True) - V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device='cuda').requires_grad_(True) - g_slc = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device='cuda').requires_grad_(True) - g_swa = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device='cuda').requires_grad_(True) - DO = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device='cuda') - - block_indices = torch.full((B, SEQ_LEN, H, S), SEQ_LEN, dtype=torch.long, device='cuda') + Q = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + g_slc = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + DO = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device="cuda") + + block_indices = torch.full((B, SEQ_LEN, H, S), SEQ_LEN, dtype=torch.long, device="cuda") + block_counts = torch.zeros((B, SEQ_LEN, H), dtype=torch.long, device="cuda") for b in range(B): for t in range(SEQ_LEN): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i + block_counts[b, t, h] = (block_indices[b, t, h] != SEQ_LEN).sum().item() block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (B, SEQ_LEN, H), device='cuda') out = kernel(Q, K, V, block_indices.to(torch.int32)) @@ -183,5 +171,43 @@ def main(): torch.testing.assert_close(ref, out, atol=1e-2, rtol=1e-2) +def run_regression_perf(): + B, SEQ_LEN, H, HQ, D, S, block_size, dtype, scale = 2, 64, 1, 16, 32, 1, 32, torch.float16, 0.1 + kernel = native_sparse_attention( + batch=B, + heads=HQ, + seq_len=SEQ_LEN, + dim=D, + is_causal=True, + block_size=block_size, + groups=HQ // H, + selected_blocks=S, + scale=scale, + ) + torch.random.manual_seed(0) + Q = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + g_slc = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + DO = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device="cuda") + block_indices = torch.full((B, SEQ_LEN, H, S), SEQ_LEN, dtype=torch.long, device="cuda") + block_counts = torch.zeros((B, SEQ_LEN, H), dtype=torch.long, device="cuda") + for b in range(B): + for t in range(SEQ_LEN): + for h in range(H): + i_i = torch.randperm(max(1, (t // block_size)))[:S] + block_indices[b, t, h, : len(i_i)] = i_i + block_counts[b, t, h] = (block_indices[b, t, h] != SEQ_LEN).sum().item() + block_indices = block_indices.sort(-1)[0] + + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(Q, K, V, block_indices.to(torch.int32)) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py b/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py index d365e7a5f..b52ebe42e 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py @@ -8,6 +8,7 @@ import tilelang.testing import fla + if 
parse(fla.__version__) < parse("0.2.1"): from fla.ops.common.utils import prepare_token_indices else: @@ -21,18 +22,11 @@ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) -def native_sparse_attention_varlen(batch, - heads, - c_seq_len, - dim, - is_causal, - scale=None, - block_size=64, - groups=1, - selected_blocks=16): + } +) +def native_sparse_attention_varlen(batch, heads, c_seq_len, dim, is_causal, scale=None, block_size=64, groups=1, selected_blocks=16): if scale is None: - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [c_seq_len, heads, dim] kv_shape = [c_seq_len, head_kv, dim] @@ -44,12 +38,12 @@ def native_sparse_attention_varlen(batch, block_counts_shape = [c_seq_len, head_kv] offsets_shape = [batch + 1] token_indices_shape = [c_seq_len, 2] - block_indices_dtype = "int32" - block_counts_dtype = "int32" - offsets_dtype = "int32" - token_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" + block_indices_dtype = T.int32 + block_counts_dtype = T.int32 + offsets_dtype = T.int32 + token_indices_dtype = T.int32 + dtype = T.float16 + accum_dtype = T.float32 block_S = block_size block_T = min(128, tilelang.math.next_power_of_2(dim)) @@ -66,14 +60,14 @@ def native_sparse_attention_varlen(batch, @T.prim_func def native_sparse_attention_varlen( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - O_slc: T.Tensor(o_slc_shape, dtype), - BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), - BlockCounts: T.Tensor(block_counts_shape, block_counts_dtype), - Offsets: T.Tensor(offsets_shape, offsets_dtype), - TokenIndices: T.Tensor(token_indices_shape, token_indices_dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + O_slc: T.Tensor(o_slc_shape, dtype), + BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), + BlockCounts: T.Tensor(block_counts_shape, block_counts_dtype), + Offsets: T.Tensor(offsets_shape, offsets_dtype), + TokenIndices: T.Tensor(token_indices_shape, token_indices_dtype), ): with T.Kernel(c_seq_len, NV, batch * head_kv, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([G, BK], dtype) @@ -100,7 +94,7 @@ def native_sparse_attention_varlen( current_seq_len = eos - bos NS = BlockCounts[i_t, i_h] - T.copy(Q[bos + i_t, i_h * G:(i_h + 1) * G, :BK], Q_shared) + T.copy(Q[bos + i_t, i_h * G : (i_h + 1) * G, :BK], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) @@ -112,21 +106,15 @@ def native_sparse_attention_varlen( # [BS, BK] # Lei: may have some padding issues # we should learn from mha varlen templates to handle this - T.copy(K[bos + i_s:bos + i_s + BS, i_h, :BK], K_shared) + T.copy(K[bos + i_s : bos + i_s + BS, i_h, :BK], K_shared) if is_causal: for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) # Softmax T.copy(scores_max, scores_max_prev) @@ -146,13 +134,13 @@ def native_sparse_attention_varlen( acc_o[i, j] *= scores_scale[i] # V * softmax(Q * K) - T.copy(V[bos + i_s:bos + i_s + BS, i_h, i_v * BV:(i_v + 
1) * BV], V_shared) + T.copy(V[bos + i_s : bos + i_s + BS, i_h, i_v * BV : (i_v + 1) * BV], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(G, BV): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, O_slc[bos + i_t, i_h * G:(i_h + 1) * G, i_v * BV:(i_v + 1) * BV]) + T.copy(O_shared, O_slc[bos + i_t, i_h * G : (i_h + 1) * G, i_v * BV : (i_v + 1) * BV]) return native_sparse_attention_varlen @@ -190,17 +178,20 @@ def parallel_nsa_fwd( o_slc = torch.empty(B, C_SEQ_LEN, HQ, V, dtype=v.dtype, device=q.device) kernel( - q.view(C_SEQ_LEN, HQ, D), k.view(C_SEQ_LEN, H, D), v.view(C_SEQ_LEN, H, D), + q.view(C_SEQ_LEN, HQ, D), + k.view(C_SEQ_LEN, H, D), + v.view(C_SEQ_LEN, H, D), o_slc.view(C_SEQ_LEN, HQ, V), block_indices.to(torch.int32).view(C_SEQ_LEN, H, S), - block_counts.to(torch.int32).view(C_SEQ_LEN, H), offsets.to(torch.int32), - token_indices.to(torch.int32)) + block_counts.to(torch.int32).view(C_SEQ_LEN, H), + offsets.to(torch.int32), + token_indices.to(torch.int32), + ) return o_slc @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, scale, offsets): ctx.dtype = q.dtype @@ -221,22 +212,25 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, window_size=window_size, scale=scale, offsets=offsets, - token_indices=token_indices) + token_indices=token_indices, + ) return o_slc.to(q.dtype) -def parallel_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def parallel_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -276,29 +270,27 @@ def parallel_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
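        Example (illustrative; mirrors the test at the bottom of this file — note that
        `q.shape[2]` (HQ) must be a multiple of `k.shape[2] * 16`, as asserted below, and
        this varlen variant expects `cu_seqlens` for the packed sequences):
            >>> B, T, H, HQ, D, S, BS = 1, 64, 1, 16, 64, 1, 32
            >>> q = torch.randn(B, T, HQ, D, dtype=torch.float16, device="cuda")
            >>> k = torch.randn(B, T, H, D, dtype=torch.float16, device="cuda")
            >>> v = torch.randn(B, T, H, D, dtype=torch.float16, device="cuda")
            >>> g_slc = torch.ones(B, T, HQ, dtype=torch.float16, device="cuda")
            >>> g_swa = torch.zeros_like(g_slc)  # unused while window_size == 0
            >>> block_indices = torch.zeros(B, T, H, S, dtype=torch.long, device="cuda")
            >>> block_counts = torch.ones(B, T, H, dtype=torch.long, device="cuda")
            >>> cu_seqlens = torch.tensor([0, 32, 64], dtype=torch.long, device="cuda")
            >>> o = parallel_nsa(q, k, v, g_slc, g_swa, block_indices, block_counts,
            ...                  block_size=BS, cu_seqlens=cu_seqlens)
            >>> o.shape
            torch.Size([1, 64, 16, 64])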
""" if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, - scale, cu_seqlens) + o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: assert False, "Window size is not supported yet" else: o = o_slc * g_slc.unsqueeze(-1) if head_first: - o = rearrange(o, 'b t h d -> b h t d') + o = rearrange(o, "b t h d -> b h t d") return o @@ -306,41 +298,57 @@ def parallel_nsa(q: torch.Tensor, N, C_SEQ_LEN, H, HQ, D, S, block_size, dtype = 2, 64, 1, 16, 64, 1, 32, torch.float16 torch.manual_seed(42) # randomly split the sequence into N segments - offsets = torch.cat([ - torch.tensor([0], dtype=torch.long), - torch.arange(16, C_SEQ_LEN)[torch.randperm(C_SEQ_LEN - 1)[:N - 1]], - torch.tensor([C_SEQ_LEN], dtype=torch.long) - ], 0).cuda().sort()[0] + offsets = ( + torch.cat( + [ + torch.tensor([0], dtype=torch.long), + torch.arange(16, C_SEQ_LEN)[torch.randperm(C_SEQ_LEN - 1)[: N - 1]], + torch.tensor([C_SEQ_LEN], dtype=torch.long), + ], + 0, + ) + .cuda() + .sort()[0] + ) # seq-first required for inputs with variable lengths - perm_q = torch.randperm(C_SEQ_LEN, device='cuda') - perm_k = torch.randperm(C_SEQ_LEN, device='cuda') - perm_v = torch.randperm(C_SEQ_LEN, device='cuda') - q = torch.linspace( - 0, 1, steps=C_SEQ_LEN, dtype=dtype, - device='cuda')[perm_q].view(1, C_SEQ_LEN, 1, 1).expand(1, C_SEQ_LEN, HQ, - D).clone().requires_grad_(True) - k = torch.linspace( - 0, 1, steps=C_SEQ_LEN, dtype=dtype, - device='cuda')[perm_k].view(1, C_SEQ_LEN, 1, 1).expand(1, C_SEQ_LEN, H, - D).clone().requires_grad_(True) - v = torch.linspace( - 0, 1, steps=C_SEQ_LEN, dtype=dtype, - device='cuda')[perm_v].view(1, C_SEQ_LEN, 1, 1).expand(1, C_SEQ_LEN, H, - D).clone().requires_grad_(True) - g_slc = torch.rand((1, C_SEQ_LEN, HQ), dtype=dtype, device='cuda').requires_grad_(True) - g_swa = torch.rand((1, C_SEQ_LEN, HQ), dtype=dtype, device='cuda').requires_grad_(True) - do = torch.randn((1, C_SEQ_LEN, HQ, D), dtype=dtype, device='cuda') + perm_q = torch.randperm(C_SEQ_LEN, device="cuda") + perm_k = torch.randperm(C_SEQ_LEN, device="cuda") + perm_v = torch.randperm(C_SEQ_LEN, device="cuda") + q = ( + torch.linspace(0, 1, steps=C_SEQ_LEN, dtype=dtype, device="cuda")[perm_q] + .view(1, C_SEQ_LEN, 1, 1) + .expand(1, C_SEQ_LEN, HQ, D) + .clone() + .requires_grad_(True) + ) + k = ( + torch.linspace(0, 1, steps=C_SEQ_LEN, dtype=dtype, device="cuda")[perm_k] + .view(1, C_SEQ_LEN, 1, 1) + .expand(1, C_SEQ_LEN, H, D) + .clone() + .requires_grad_(True) + ) + v = ( + torch.linspace(0, 1, steps=C_SEQ_LEN, 
dtype=dtype, device="cuda")[perm_v] + .view(1, C_SEQ_LEN, 1, 1) + .expand(1, C_SEQ_LEN, H, D) + .clone() + .requires_grad_(True) + ) + g_slc = torch.rand((1, C_SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.rand((1, C_SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((1, C_SEQ_LEN, HQ, D), dtype=dtype, device="cuda") token_indices = prepare_token_indices(offsets).tolist() - block_indices = torch.full((1, C_SEQ_LEN, H, S), C_SEQ_LEN, dtype=torch.long, device='cuda') + block_indices = torch.full((1, C_SEQ_LEN, H, S), C_SEQ_LEN, dtype=torch.long, device="cuda") for i in range(C_SEQ_LEN): _, t = token_indices[i] for h in range(H): i_i = torch.randperm(max(1, tilelang.cdiv(t, block_size)))[:S] - block_indices[0, i, h, :len(i_i)] = i_i + block_indices[0, i, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (1, C_SEQ_LEN, H), device='cuda') + block_counts = torch.randint(1, S + 1, (1, C_SEQ_LEN, H), device="cuda") ref = naive_nsa( q=q, @@ -351,7 +359,8 @@ def parallel_nsa(q: torch.Tensor, block_indices=block_indices, block_counts=block_counts, block_size=block_size, - cu_seqlens=offsets) + cu_seqlens=offsets, + ) tri = parallel_nsa( q=q, @@ -362,7 +371,8 @@ def parallel_nsa(q: torch.Tensor, block_indices=block_indices, block_counts=block_counts, block_size=block_size, - cu_seqlens=offsets) + cu_seqlens=offsets, + ) print("tri", tri) print("ref", ref) diff --git a/examples/deepseek_nsa/example_triton_nsa_bwd.py b/examples/deepseek_nsa/example_triton_nsa_bwd.py index e912794a4..af05bfa70 100644 --- a/examples/deepseek_nsa/example_triton_nsa_bwd.py +++ b/examples/deepseek_nsa/example_triton_nsa_bwd.py @@ -8,6 +8,7 @@ import triton.language as tl import fla + if parse(fla.__version__) < parse("0.2.1"): from fla.ops.common.utils import prepare_token_indices else: @@ -17,21 +18,44 @@ from einops import rearrange -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) @triton.jit -def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, block_indices, - block_counts, offsets, token_indices, T, H: tl.constexpr, - HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, V: tl.constexpr, - S: tl.constexpr, BS: tl.constexpr, WS: tl.constexpr, BK: tl.constexpr, - BV: tl.constexpr, USE_OFFSETS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_fwd_kernel( + q, + k, + v, + o_slc, + o_swa, + lse_slc, + lse_swa, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H @@ -46,20 +70,18 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc # else: NS = S - p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), 
- (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # the Q block is kept in the shared memory throughout the whole kernel # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), - (G, BV), (1, 0)) + p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_slc = tl.zeros([G, BV], dtype=tl.float32) - b_m_slc = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_slc = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_slc = tl.zeros([G], dtype=tl.float32) for i in range(NS): i_s = tl.load(block_indices + i).to(tl.int32) * BS @@ -72,7 +94,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_slc = tl.load(p_v_slc, boundary_check=(0, 1)) # [G, BS] b_s_slc = tl.dot(b_q, b_k_slc) - b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float('-inf')) + b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float("-inf")) # [G] b_m_slc, b_mp_slc = tl.maximum(b_m_slc, tl.max(b_s_slc, 1)), b_m_slc @@ -92,7 +114,6 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -105,8 +126,7 @@ def forward(ctx, q, k, v, block_indices, block_size, scale, offsets): # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]] token_indices = prepare_token_indices(offsets) if offsets is not None else None - o, lse = parallel_nsa_fwd( - q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) + o, lse = parallel_nsa_fwd(q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) ctx.save_for_backward(q, k, v, o, lse) ctx.block_indices = block_indices ctx.block_size = block_size @@ -134,7 +154,8 @@ def backward(ctx, do_slc, do_swa): window_size=ctx.window_size, scale=ctx.scale, offsets=ctx.offsets, - token_indices=ctx.token_indices) + token_indices=ctx.token_indices, + ) return dq.to(q), dk.to(k), dv.to(v), None, None, None, None, None, None, None, None @@ -199,37 +220,56 @@ def parallel_nsa_fwd( return o_slc, lse_slc, o_swa, lse_swa -@triton.heuristics({'USE_OFFSETS': lambda args: args['offsets'] is not None}) +@triton.heuristics({"USE_OFFSETS": lambda args: args["offsets"] is not None}) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) -@triton.jit(do_not_specialize=['T']) -def parallel_nsa_bwd_kernel_dkv(q, k, v, lse_slc, lse_swa, delta_slc, delta_swa, do_slc, do_swa, dk, - dv, block_mask, offsets, chunk_indices, scale, T, B: tl.constexpr, - H: tl.constexpr, HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, - V: tl.constexpr, M: tl.constexpr, BS: tl.constexpr, - WS: tl.constexpr, BK: tl.constexpr, BV: tl.constexpr, - USE_OFFSETS: tl.constexpr): +@triton.jit(do_not_specialize=["T"]) +def parallel_nsa_bwd_kernel_dkv( + q, + k, + v, + lse_slc, + lse_swa, + delta_slc, + delta_swa, + do_slc, + do_swa, + dk, + dv, + block_mask, + offsets, + chunk_indices, + scale, + T, + B: tl.constexpr, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + M: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: 
tl.constexpr, + USE_OFFSETS: tl.constexpr, +): i_v, i_s, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H if USE_OFFSETS: - i_n, i_s = tl.load(chunk_indices + i_s * 2).to(tl.int32), tl.load(chunk_indices + i_s * 2 + - 1).to(tl.int32) + i_n, i_s = tl.load(chunk_indices + i_s * 2).to(tl.int32), tl.load(chunk_indices + i_s * 2 + 1).to(tl.int32) bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) T = eos - bos else: bos, eos = i_b * T, i_b * T + T - p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_s * BS, 0), (BS, BK), - (1, 0)) - p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_s * BS, i_v * BV), - (BS, BV), (1, 0)) - p_dk = tl.make_block_ptr(dk + (i_v * B * T * H + bos * H + i_h) * K, (T, K), (H * K, 1), - (i_s * BS, 0), (BS, BK), (1, 0)) - p_dv = tl.make_block_ptr(dv + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_s * BS, i_v * BV), - (BS, BV), (1, 0)) + p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_s * BS, 0), (BS, BK), (1, 0)) + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_s * BS, i_v * BV), (BS, BV), (1, 0)) + p_dk = tl.make_block_ptr(dk + (i_v * B * T * H + bos * H + i_h) * K, (T, K), (H * K, 1), (i_s * BS, 0), (BS, BK), (1, 0)) + p_dv = tl.make_block_ptr(dv + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_s * BS, i_v * BV), (BS, BV), (1, 0)) # [BS, BK] b_k = tl.load(p_k, boundary_check=(0, 1)) @@ -241,14 +281,12 @@ def parallel_nsa_bwd_kernel_dkv(q, k, v, lse_slc, lse_swa, delta_slc, delta_swa, for i in range(i_s * BS, T): b_m_slc = tl.load(block_mask + (bos + i) * H * M + i_h * M + i_s) if b_m_slc: - p_q = tl.make_block_ptr(q + (bos + i) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_do_slc = tl.make_block_ptr(do_slc + (bos + i) * HQ * V, (HQ, V), (V, 1), - (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_do_slc = tl.make_block_ptr(do_slc + (bos + i) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i) * HQ + i_h * G + tl.arange(0, G) p_delta_slc = delta_slc + (bos + i) * HQ + i_h * G + tl.arange(0, G) # [G, BV] @@ -272,14 +310,12 @@ def parallel_nsa_bwd_kernel_dkv(q, k, v, lse_slc, lse_swa, delta_slc, delta_swa, if WS > 0: o_s = i_s * BS + tl.arange(0, BS) if max(i_s * BS, i - WS + 1) < min((i_s + 1) * BS, i + 1): - p_q = tl.make_block_ptr(q + (bos + i) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), - (G, BK), (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_do_swa = tl.make_block_ptr(do_swa + (bos + i) * HQ * V, (HQ, V), (V, 1), - (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_do_swa = tl.make_block_ptr(do_swa + (bos + i) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_swa = lse_swa + (bos + i) * HQ + i_h * G + tl.arange(0, G) p_delta_swa = delta_swa + (bos + i) * HQ + i_h * G + tl.arange(0, G) # [G, BV] @@ -304,12 +340,19 @@ def parallel_nsa_bwd_kernel_dkv(q, k, v, lse_slc, lse_swa, delta_slc, delta_swa, tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) -@triton.heuristics( - {'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor)}) +@triton.heuristics({"USE_BLOCK_COUNTS": lambda 
args: isinstance(args["block_counts"], torch.Tensor)}) @triton.jit -def parallel_nsa_kernel_mask(block_indices, block_counts, block_mask, T: tl.constexpr, - H: tl.constexpr, S: tl.constexpr, BS: tl.constexpr, NS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_kernel_mask( + block_indices, + block_counts, + block_mask, + T: tl.constexpr, + H: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + NS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_b, i_hs = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_h, i_s = i_hs // S, i_hs % S @@ -320,31 +363,56 @@ def parallel_nsa_kernel_mask(block_indices, block_counts, block_mask, T: tl.cons b_m = b_i * BS <= i_t if b_i < NS and b_i >= 0: - tl.store(block_mask + i_b * T * H * NS + i_t * H * NS + i_h * NS + b_i, - b_m.to(block_mask.dtype.element_ty)) + tl.store(block_mask + i_b * T * H * NS + i_t * H * NS + i_h * NS + b_i, b_m.to(block_mask.dtype.element_ty)) -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor) -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) -@triton.jit(do_not_specialize=['T']) -def parallel_nsa_bwd_kernel_dq(q, k, v, lse_slc, delta_slc, do_slc, lse_swa, delta_swa, do_swa, dq, - scale, block_indices, block_counts, offsets, token_indices, T, - B: tl.constexpr, H: tl.constexpr, HQ: tl.constexpr, G: tl.constexpr, - K: tl.constexpr, V: tl.constexpr, S: tl.constexpr, BS: tl.constexpr, - WS: tl.constexpr, BK: tl.constexpr, BV: tl.constexpr, - USE_OFFSETS: tl.constexpr, USE_BLOCK_COUNTS: tl.constexpr): +@triton.jit(do_not_specialize=["T"]) +def parallel_nsa_bwd_kernel_dq( + q, + k, + v, + lse_slc, + delta_slc, + do_slc, + lse_swa, + delta_swa, + do_swa, + dq, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + B: tl.constexpr, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H if USE_OFFSETS: - i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + - 1).to(tl.int32) + i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32) bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) T = eos - bos else: @@ -449,27 +517,49 @@ def parallel_nsa_bwd_kernel_dq(q, k, v, lse_slc, delta_slc, do_slc, lse_swa, del tl.store(p_dq, (b_dq_slc + b_dq_swa).to(p_dq.dtype.element_ty), boundary_check=(0, 1)) -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) 
@triton.jit -def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, block_indices, - block_counts, offsets, token_indices, T, H: tl.constexpr, - HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, V: tl.constexpr, - S: tl.constexpr, BS: tl.constexpr, WS: tl.constexpr, BK: tl.constexpr, - BV: tl.constexpr, USE_OFFSETS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_fwd_kernel( + q, + k, + v, + o_slc, + o_swa, + lse_slc, + lse_swa, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H if USE_OFFSETS: - i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + - 1).to(tl.int32) + i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32) bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) T = eos - bos else: @@ -484,20 +574,18 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc else: NS = S - p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # the Q block is kept in the shared memory throughout the whole kernel # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), - (G, BV), (1, 0)) + p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_slc = tl.zeros([G, BV], dtype=tl.float32) - b_m_slc = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_slc = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_slc = tl.zeros([G], dtype=tl.float32) for i in range(NS): i_s = tl.load(block_indices + i).to(tl.int32) * BS @@ -510,7 +598,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_slc = tl.load(p_v_slc, boundary_check=(0, 1)) # [G, BS] b_s_slc = tl.dot(b_q, b_k_slc) - b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float('-inf')) + b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float("-inf")) # [G] b_m_slc, b_mp_slc = tl.maximum(b_m_slc, tl.max(b_s_slc, 1)), b_m_slc @@ -529,13 +617,12 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc tl.store(p_lse_slc, b_m_slc.to(p_lse_slc.dtype.element_ty)) if WS > 0: - p_o_swa = tl.make_block_ptr(o_swa + (bos + i_t) * HQ * V, (HQ, V), (V, 1), - (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_o_swa = tl.make_block_ptr(o_swa + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_swa = lse_swa + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_swa = tl.zeros([G, BV], dtype=tl.float32) - b_m_swa = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_swa = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_swa = tl.zeros([G], dtype=tl.float32) for i_s in range(max(0, i_t - WS + 1), i_t + 1, BS): p_k_swa = tl.make_block_ptr(k, (K, T), (1, H * 
K), (0, i_s), (BK, BS), (0, 1)) @@ -546,7 +633,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_swa = tl.load(p_v_swa, boundary_check=(0, 1)) # [G, BS] b_s_swa = tl.dot(b_q, b_k_swa) - b_s_swa = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_swa, float('-inf')) + b_s_swa = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_swa, float("-inf")) # [G] b_m_swa, b_mp_swa = tl.maximum(b_m_swa, tl.max(b_s_swa, 1)), b_m_swa @@ -593,14 +680,8 @@ def parallel_nsa_block_mask( block_mask = torch.zeros(B, T, H, NS, dtype=torch.bool, device=block_indices.device) parallel_nsa_kernel_mask[(T, B, H * S)]( - block_indices=block_indices, - block_counts=block_counts, - block_mask=block_mask, - T=T, - H=H, - S=S, - BS=BS, - NS=NS) + block_indices=block_indices, block_counts=block_counts, block_mask=block_mask, T=T, H=H, S=S, BS=BS, NS=NS + ) return block_mask @@ -676,7 +757,8 @@ def parallel_nsa_bwd( BS=BS, WS=WS, BK=BK, - BV=BV) + BV=BV, + ) dq = dq.sum(0) if offsets is not None: @@ -719,14 +801,14 @@ def parallel_nsa_bwd( BS=BS, WS=WS, BK=BK, - BV=BV) + BV=BV, + ) dk = dk.sum(0) return dq, dk, dv @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -749,7 +831,8 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, window_size=window_size, scale=scale, offsets=offsets, - token_indices=token_indices) + token_indices=token_indices, + ) ctx.save_for_backward(q, k, v, o_slc, lse_slc, o_swa, lse_swa) ctx.block_indices = block_indices ctx.block_counts = block_counts @@ -781,22 +864,25 @@ def backward(ctx, do_slc, do_swa): window_size=ctx.window_size, scale=ctx.scale, offsets=ctx.offsets, - token_indices=ctx.token_indices) + token_indices=ctx.token_indices, + ) return dq.to(q), dk.to(k), dv.to(v), None, None, None, None, None, None, None, None -def parallel_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def parallel_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -836,51 +922,49 @@ def parallel_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
""" if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, - window_size, scale, cu_seqlens) + o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: o = torch.addcmul(o_slc * g_slc.unsqueeze(-1), o_swa, g_swa.unsqueeze(-1)) else: o = o_slc * g_slc.unsqueeze(-1) if head_first: - o = rearrange(o, 'b t h d -> b h t d') + o = rearrange(o, "b t h d -> b h t d") return o if __name__ == "__main__": B, T, H, HQ, D, S, block_size, dtype = 2, 64, 1, 16, 32, 1, 32, torch.float16 torch.random.manual_seed(0) - q = torch.randn((B, T, HQ, D), dtype=dtype, device='cuda').requires_grad_(True) - k = torch.randn((B, T, H, D), dtype=dtype, device='cuda').requires_grad_(True) - v = torch.randn((B, T, H, D), dtype=dtype, device='cuda').requires_grad_(True) - g_slc = torch.ones((B, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) - g_swa = torch.ones((B, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) - do = torch.randn((B, T, HQ, D), dtype=dtype, device='cuda') - - block_indices = torch.full((B, T, H, S), T, dtype=torch.long, device='cuda') + q = torch.randn((B, T, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + k = torch.randn((B, T, H, D), dtype=dtype, device="cuda").requires_grad_(True) + v = torch.randn((B, T, H, D), dtype=dtype, device="cuda").requires_grad_(True) + g_slc = torch.ones((B, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.ones((B, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((B, T, HQ, D), dtype=dtype, device="cuda") + + block_indices = torch.full((B, T, H, S), T, dtype=torch.long, device="cuda") for b in range(B): for t in range(T): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (B, T, H), device='cuda') + block_counts = torch.randint(1, S + 1, (B, T, H), device="cuda") ref = naive_nsa( q=q, diff --git a/examples/deepseek_nsa/example_triton_nsa_fwd.py b/examples/deepseek_nsa/example_triton_nsa_fwd.py index 2c740013a..c9ab28daa 100644 --- a/examples/deepseek_nsa/example_triton_nsa_fwd.py +++ b/examples/deepseek_nsa/example_triton_nsa_fwd.py @@ -8,6 +8,7 @@ import triton.language as tl import fla + if parse(fla.__version__) < parse("0.2.1"): from fla.ops.common.utils import prepare_token_indices else: @@ -17,21 +18,44 @@ from einops import rearrange -@triton.heuristics({ - 'USE_OFFSETS': lambda 
args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) @triton.jit -def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, block_indices, - block_counts, offsets, token_indices, T, H: tl.constexpr, - HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, V: tl.constexpr, - S: tl.constexpr, BS: tl.constexpr, WS: tl.constexpr, BK: tl.constexpr, - BV: tl.constexpr, USE_OFFSETS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_fwd_kernel( + q, + k, + v, + o_slc, + o_swa, + lse_slc, + lse_swa, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H @@ -46,20 +70,18 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc # else: NS = S - p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # the Q block is kept in the shared memory throughout the whole kernel # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), - (G, BV), (1, 0)) + p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_slc = tl.zeros([G, BV], dtype=tl.float32) - b_m_slc = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_slc = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_slc = tl.zeros([G], dtype=tl.float32) for i in range(NS): i_s = tl.load(block_indices + i).to(tl.int32) * BS @@ -72,7 +94,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_slc = tl.load(p_v_slc, boundary_check=(0, 1)) # [G, BS] b_s_slc = tl.dot(b_q, b_k_slc) - b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float('-inf')) + b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float("-inf")) # [G] b_m_slc, b_mp_slc = tl.maximum(b_m_slc, tl.max(b_s_slc, 1)), b_m_slc @@ -92,7 +114,6 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -105,8 +126,7 @@ def forward(ctx, q, k, v, block_indices, block_size, scale, offsets): # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]] token_indices = prepare_token_indices(offsets) if offsets is not None else None - o, lse = parallel_nsa_fwd( - q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) + o, lse = parallel_nsa_fwd(q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) ctx.save_for_backward(q, k, v, o, lse) ctx.block_indices 
= block_indices ctx.block_size = block_size @@ -177,7 +197,6 @@ def parallel_nsa_fwd( @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -200,7 +219,8 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, window_size=window_size, scale=scale, offsets=offsets, - token_indices=token_indices) + token_indices=token_indices, + ) ctx.save_for_backward(q, k, v, o_slc, lse_slc, o_swa, lse_swa) ctx.block_indices = block_indices ctx.block_counts = block_counts @@ -212,18 +232,20 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, return o_slc.to(q.dtype), o_swa.to(q.dtype) if o_swa is not None else o_swa -def parallel_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def parallel_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -263,51 +285,49 @@ def parallel_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. """ if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, - window_size, scale, cu_seqlens) + o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: o = torch.addcmul(o_slc * g_slc.unsqueeze(-1), o_swa, g_swa.unsqueeze(-1)) else: o = o_slc * g_slc.unsqueeze(-1) if head_first: - o = rearrange(o, 'b t h d -> b h t d') + o = rearrange(o, "b t h d -> b h t d") return o if __name__ == "__main__": B, T, H, HQ, D, S, block_size, dtype = 2, 64, 1, 16, 32, 1, 32, torch.float16 torch.random.manual_seed(0) - q = torch.randn((B, T, HQ, D), dtype=dtype, device='cuda').requires_grad_(True) - k = torch.randn((B, T, H, D), dtype=dtype, device='cuda').requires_grad_(True) - v = torch.randn((B, T, H, D), dtype=dtype, device='cuda').requires_grad_(True) - g_slc = torch.ones((B, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) 
- g_swa = torch.ones((B, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) - do = torch.randn((B, T, HQ, D), dtype=dtype, device='cuda') - - block_indices = torch.full((B, T, H, S), T, dtype=torch.long, device='cuda') + q = torch.randn((B, T, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + k = torch.randn((B, T, H, D), dtype=dtype, device="cuda").requires_grad_(True) + v = torch.randn((B, T, H, D), dtype=dtype, device="cuda").requires_grad_(True) + g_slc = torch.ones((B, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.ones((B, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((B, T, HQ, D), dtype=dtype, device="cuda") + + block_indices = torch.full((B, T, H, S), T, dtype=torch.long, device="cuda") for b in range(B): for t in range(T): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (B, T, H), device='cuda') + block_counts = torch.randint(1, S + 1, (B, T, H), device="cuda") ref = naive_nsa( q=q, diff --git a/examples/deepseek_nsa/example_triton_nsa_fwd_varlen.py b/examples/deepseek_nsa/example_triton_nsa_fwd_varlen.py index 9ccbff6a4..cb4eb6d7b 100644 --- a/examples/deepseek_nsa/example_triton_nsa_fwd_varlen.py +++ b/examples/deepseek_nsa/example_triton_nsa_fwd_varlen.py @@ -8,6 +8,7 @@ import triton.language as tl import fla + if parse(fla.__version__) < parse("0.2.1"): from fla.ops.common.utils import prepare_token_indices else: @@ -17,27 +18,49 @@ from einops import rearrange -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) @triton.jit -def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, block_indices, - block_counts, offsets, token_indices, T, H: tl.constexpr, - HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, V: tl.constexpr, - S: tl.constexpr, BS: tl.constexpr, WS: tl.constexpr, BK: tl.constexpr, - BV: tl.constexpr, USE_OFFSETS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_fwd_kernel( + q, + k, + v, + o_slc, + o_swa, + lse_slc, + lse_swa, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H if USE_OFFSETS: - i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + - 1).to(tl.int32) + i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32) bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) T = eos - bos else: @@ -52,20 +75,18 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc else: NS = S - p_q = 
tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # the Q block is kept in the shared memory throughout the whole kernel # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), - (G, BV), (1, 0)) + p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_slc = tl.zeros([G, BV], dtype=tl.float32) - b_m_slc = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_slc = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_slc = tl.zeros([G], dtype=tl.float32) for i in range(NS): i_s = tl.load(block_indices + i).to(tl.int32) * BS @@ -78,7 +99,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_slc = tl.load(p_v_slc, boundary_check=(0, 1)) # [G, BS] b_s_slc = tl.dot(b_q, b_k_slc) - b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float('-inf')) + b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float("-inf")) # [G] b_m_slc, b_mp_slc = tl.maximum(b_m_slc, tl.max(b_s_slc, 1)), b_m_slc @@ -97,13 +118,12 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc tl.store(p_lse_slc, b_m_slc.to(p_lse_slc.dtype.element_ty)) if WS > 0: - p_o_swa = tl.make_block_ptr(o_swa + (bos + i_t) * HQ * V, (HQ, V), (V, 1), - (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_o_swa = tl.make_block_ptr(o_swa + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_swa = lse_swa + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_swa = tl.zeros([G, BV], dtype=tl.float32) - b_m_swa = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_swa = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_swa = tl.zeros([G], dtype=tl.float32) for i_s in range(max(0, i_t - WS + 1), i_t + 1, BS): p_k_swa = tl.make_block_ptr(k, (K, T), (1, H * K), (0, i_s), (BK, BS), (0, 1)) @@ -114,7 +134,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_swa = tl.load(p_v_swa, boundary_check=(0, 1)) # [G, BS] b_s_swa = tl.dot(b_q, b_k_swa) - b_s_swa = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_swa, float('-inf')) + b_s_swa = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_swa, float("-inf")) # [G] b_m_swa, b_mp_swa = tl.maximum(b_m_swa, tl.max(b_s_swa, 1)), b_m_swa @@ -196,7 +216,6 @@ def parallel_nsa_fwd( @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -219,7 +238,8 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, window_size=window_size, scale=scale, offsets=offsets, - token_indices=token_indices) + token_indices=token_indices, + ) ctx.save_for_backward(q, k, v, o_slc, lse_slc, o_swa, lse_swa) ctx.block_indices = block_indices ctx.block_counts = block_counts @@ -231,18 +251,20 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, return o_slc.to(q.dtype), o_swa.to(q.dtype) if o_swa is not None else o_swa -def parallel_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - 
block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def parallel_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -282,29 +304,27 @@ def parallel_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. """ if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, - window_size, scale, cu_seqlens) + o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: o = torch.addcmul(o_slc * g_slc.unsqueeze(-1), o_swa, g_swa.unsqueeze(-1)) else: o = o_slc * g_slc.unsqueeze(-1) if head_first: - o = rearrange(o, 'b t h d -> b h t d') + o = rearrange(o, "b t h d -> b h t d") return o @@ -312,38 +332,35 @@ def parallel_nsa(q: torch.Tensor, N, T, H, HQ, D, S, block_size, dtype = 2, 64, 1, 16, 64, 1, 32, torch.float16 torch.manual_seed(42) # randomly split the sequence into N segments - offsets = torch.cat([ - torch.tensor([0], dtype=torch.long), - torch.arange(16, T)[torch.randperm(T - 1)[:N - 1]], - torch.tensor([T], dtype=torch.long) - ], 0).cuda().sort()[0] + offsets = ( + torch.cat( + [torch.tensor([0], dtype=torch.long), torch.arange(16, T)[torch.randperm(T - 1)[: N - 1]], torch.tensor([T], dtype=torch.long)], + 0, + ) + .cuda() + .sort()[0] + ) # offsets.shape is [N+1] # seq-first required for inputs with variable lengths - perm_q = torch.randperm(T, device='cuda') - perm_k = torch.randperm(T, device='cuda') - perm_v = torch.randperm(T, device='cuda') - q = torch.linspace( - 0, 1, steps=T, dtype=dtype, - device='cuda')[perm_q].view(1, T, 1, 1).expand(1, T, HQ, D).clone().requires_grad_(True) - k = torch.linspace( - 0, 1, steps=T, dtype=dtype, - device='cuda')[perm_k].view(1, T, 1, 1).expand(1, T, H, D).clone().requires_grad_(True) - v = torch.linspace( - 0, 1, steps=T, dtype=dtype, - device='cuda')[perm_v].view(1, T, 1, 1).expand(1, T, H, D).clone().requires_grad_(True) - g_slc = torch.rand((1, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) - g_swa = torch.rand((1, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) - do = torch.randn((1, T, HQ, 
D), dtype=dtype, device='cuda') + perm_q = torch.randperm(T, device="cuda") + perm_k = torch.randperm(T, device="cuda") + perm_v = torch.randperm(T, device="cuda") + q = torch.linspace(0, 1, steps=T, dtype=dtype, device="cuda")[perm_q].view(1, T, 1, 1).expand(1, T, HQ, D).clone().requires_grad_(True) + k = torch.linspace(0, 1, steps=T, dtype=dtype, device="cuda")[perm_k].view(1, T, 1, 1).expand(1, T, H, D).clone().requires_grad_(True) + v = torch.linspace(0, 1, steps=T, dtype=dtype, device="cuda")[perm_v].view(1, T, 1, 1).expand(1, T, H, D).clone().requires_grad_(True) + g_slc = torch.rand((1, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.rand((1, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((1, T, HQ, D), dtype=dtype, device="cuda") token_indices = prepare_token_indices(offsets).tolist() - block_indices = torch.full((1, T, H, S), T, dtype=torch.long, device='cuda') + block_indices = torch.full((1, T, H, S), T, dtype=torch.long, device="cuda") for i in range(T): _, t = token_indices[i] for h in range(H): i_i = torch.randperm(max(1, triton.cdiv(t, block_size)))[:S] - block_indices[0, i, h, :len(i_i)] = i_i + block_indices[0, i, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (1, T, H), device='cuda') + block_counts = torch.randint(1, S + 1, (1, T, H), device="cuda") ref = naive_nsa( q=q, @@ -354,7 +371,8 @@ def parallel_nsa(q: torch.Tensor, block_indices=block_indices, block_counts=block_counts, block_size=block_size, - cu_seqlens=offsets) + cu_seqlens=offsets, + ) tri = parallel_nsa( q=q, @@ -365,7 +383,8 @@ def parallel_nsa(q: torch.Tensor, block_indices=block_indices, block_counts=block_counts, block_size=block_size, - cu_seqlens=offsets) + cu_seqlens=offsets, + ) print("tri", tri) print("ref", ref) diff --git a/examples/deepseek_nsa/reference.py b/examples/deepseek_nsa/reference.py index 958d0c19e..58083108e 100644 --- a/examples/deepseek_nsa/reference.py +++ b/examples/deepseek_nsa/reference.py @@ -6,18 +6,20 @@ from einops import rearrange, repeat -def naive_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def naive_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -57,26 +59,24 @@ def naive_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
""" if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - raise RuntimeError( - "Sequences with variable lengths are not supported for head-first mode") + raise RuntimeError("Sequences with variable lengths are not supported for head-first mode") if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") dtype = q.dtype G = q.shape[2] // k.shape[2] BS = block_size S = block_indices.shape[-1] - k, v, block_indices = (repeat(x, 'b t h d -> b t (h g) d', g=G) for x in (k, v, block_indices)) + k, v, block_indices = (repeat(x, "b t h d -> b t (h g) d", g=G) for x in (k, v, block_indices)) if isinstance(block_counts, torch.Tensor): - block_counts = repeat(block_counts, 'b t h -> b t (h g)', g=G) + block_counts = repeat(block_counts, "b t h -> b t (h g)", g=G) c = torch.arange(S).repeat_interleave(BS).unsqueeze(1).expand(-1, q.shape[2]).to(q.device) q, k, v = map(lambda x: x.float(), (q, k, v)) @@ -86,14 +86,11 @@ def naive_nsa(q: torch.Tensor, if cu_seqlens is None: varlen = False B, T = q.shape[:2] - cu_seqlens = torch.cat( - [block_indices.new_tensor(range(0, B * T, T)), - block_indices.new_tensor([B * T])]) + cu_seqlens = torch.cat([block_indices.new_tensor(range(0, B * T, T)), block_indices.new_tensor([B * T])]) for i in range(len(cu_seqlens) - 1): if not varlen: - q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = q[i], k[i], v[i], g_slc[i], g_swa[ - i], block_indices[i] + q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = q[i], k[i], v[i], g_slc[i], g_swa[i], block_indices[i] if isinstance(block_counts, torch.Tensor): s_b = block_counts[i] else: @@ -101,10 +98,10 @@ def naive_nsa(q: torch.Tensor, else: T = cu_seqlens[i + 1] - cu_seqlens[i] q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = map( - lambda x: x[0][cu_seqlens[i]:cu_seqlens[i + 1]], - (q, k, v, g_slc, g_swa, block_indices)) + lambda x: x[0][cu_seqlens[i] : cu_seqlens[i + 1]], (q, k, v, g_slc, g_swa, block_indices) + ) if isinstance(block_counts, torch.Tensor): - s_b = block_counts[0][cu_seqlens[i]:cu_seqlens[i + 1]] + s_b = block_counts[0][cu_seqlens[i] : cu_seqlens[i + 1]] else: s_b = block_counts @@ -126,34 +123,28 @@ def naive_nsa(q: torch.Tensor, else: s_i = s_b # [S*BS, HQ, -1] - k_i_slc, v_i_slc = map( - lambda x: x.gather( - 0, - i_i.clamp(0, T - 1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])), (k_b, v_b)) + k_i_slc, v_i_slc = map(lambda x: x.gather(0, i_i.clamp(0, T - 1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])), (k_b, v_b)) # [S*BS, HQ] - attn_slc = torch.einsum('h d, n h d -> n h', q_i, k_i_slc).masked_fill( - torch.logical_or(i_i < 0, i_i > i_q) | - (c >= s_i if block_counts is not None else False), float('-inf')).softmax(0) + attn_slc = ( + torch.einsum("h d, n h d -> n h", q_i, k_i_slc) + .masked_fill(torch.logical_or(i_i < 0, i_i > i_q) | (c >= s_i if block_counts is not None else False), float("-inf")) + .softmax(0) + ) if not varlen: - o_slc[i, i_q] = torch.einsum('n h, n h v -> h v', attn_slc, - v_i_slc) * 
g_slc_i.unsqueeze(-1) + o_slc[i, i_q] = torch.einsum("n h, n h v -> h v", attn_slc, v_i_slc) * g_slc_i.unsqueeze(-1) else: - o_slc[0][cu_seqlens[i] + i_q] = torch.einsum('n h, n h v -> h v', attn_slc, - v_i_slc) * g_slc_i.unsqueeze(-1) + o_slc[0][cu_seqlens[i] + i_q] = torch.einsum("n h, n h v -> h v", attn_slc, v_i_slc) * g_slc_i.unsqueeze(-1) if window_size > 0: - k_i_swa, v_i_swa = map(lambda x: x[max(0, i_q - window_size + 1):i_q + 1], - (k_b, v_b)) - attn_swa = torch.einsum('h d, n h d -> n h', q_i, k_i_swa).softmax(0) + k_i_swa, v_i_swa = map(lambda x: x[max(0, i_q - window_size + 1) : i_q + 1], (k_b, v_b)) + attn_swa = torch.einsum("h d, n h d -> n h", q_i, k_i_swa).softmax(0) if not varlen: - o_swa[i, i_q] = torch.einsum('n h, n h v -> h v', attn_swa, - v_i_swa) * g_swa_i.unsqueeze(-1) + o_swa[i, i_q] = torch.einsum("n h, n h v -> h v", attn_swa, v_i_swa) * g_swa_i.unsqueeze(-1) else: - o_swa[0][cu_seqlens[i] + i_q] = torch.einsum('n h, n h v -> h v', attn_swa, - v_i_swa) * g_swa_i.unsqueeze(-1) + o_swa[0][cu_seqlens[i] + i_q] = torch.einsum("n h, n h v -> h v", attn_swa, v_i_swa) * g_swa_i.unsqueeze(-1) if head_first: - o_slc = rearrange(o_slc, 'b t h d -> b h t d') - o_swa = rearrange(o_swa, 'b t h d -> b h t d') + o_slc = rearrange(o_slc, "b t h d -> b h t d") + o_swa = rearrange(o_swa, "b t h d -> b h t d") return o_slc.to(dtype) + o_swa.to(dtype) if o_swa is not None else o_slc.to(dtype) @@ -187,7 +178,7 @@ def naive_nsa_simple( o (torch.Tensor): Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. """ - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 dtype = q.dtype HQ = q.shape[2] @@ -197,8 +188,8 @@ def naive_nsa_simple( BS = block_size S = block_indices.shape[-1] SELECTED_BLOCKS_SIZE = S * BS - k, v, block_indices = (repeat(x, 'b t h d -> b t (h g) d', g=G) for x in (k, v, block_indices)) - block_counts = repeat(block_counts, 'b t h -> b t (h g)', g=G) + k, v, block_indices = (repeat(x, "b t h d -> b t (h g) d", g=G) for x in (k, v, block_indices)) + block_counts = repeat(block_counts, "b t h -> b t (h g)", g=G) c = torch.arange(S).repeat_interleave(BS).unsqueeze(1).expand(-1, q.shape[2]).to(q.device) q, k, v = map(lambda x: x.float(), (q, k, v)) o = torch.zeros_like(v) @@ -228,10 +219,10 @@ def naive_nsa_simple( v_i[t, h] = v_b[selected_block_index, h, :] # [S*BS, HQ] - attn = torch.einsum('h d, n h d -> n h', q_i, k_i) - attn = attn.masked_fill((i_i > i_q) | (c >= s_i), float('-inf')) + attn = torch.einsum("h d, n h d -> n h", q_i, k_i) + attn = attn.masked_fill((i_i > i_q) | (c >= s_i), float("-inf")) attn = torch.softmax(attn, dim=0) - o[i, i_q] = torch.einsum('n h, n h v -> h v', attn, v_i) + o[i, i_q] = torch.einsum("n h, n h v -> h v", attn, v_i) return o.to(dtype) @@ -265,7 +256,7 @@ def naive_nsa_simple_inference( o (torch.Tensor): Outputs of shape `[B, 1, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
""" - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 dtype = q.dtype HQ = q.shape[2] @@ -275,8 +266,8 @@ def naive_nsa_simple_inference( BS = block_size S = block_indices.shape[-1] SELECTED_BLOCKS_SIZE = S * BS - k, v, block_indices = (repeat(x, 'b t h d -> b t (h g) d', g=G) for x in (k, v, block_indices)) - block_counts = repeat(block_counts, 'b t h -> b t (h g)', g=G) + k, v, block_indices = (repeat(x, "b t h d -> b t (h g) d", g=G) for x in (k, v, block_indices)) + block_counts = repeat(block_counts, "b t h -> b t (h g)", g=G) c = torch.arange(S).repeat_interleave(BS).unsqueeze(1).expand(-1, q.shape[2]).to(q.device) q, k, v = map(lambda x: x.float(), (q, k, v)) o = torch.zeros_like(q) @@ -306,9 +297,9 @@ def naive_nsa_simple_inference( v_i[t, h] = v_b[selected_block_index, h, :] # [S*BS, HQ] - attn = torch.einsum('h d, n h d -> n h', q_i, k_i) - attn = attn.masked_fill((c >= s_i), float('-inf')) + attn = torch.einsum("h d, n h d -> n h", q_i, k_i) + attn = attn.masked_fill((c >= s_i), float("-inf")) attn = torch.softmax(attn, dim=0) - o[i, 0] = torch.einsum('n h, n h v -> h v', attn, v_i) + o[i, 0] = torch.einsum("n h, n h v -> h v", attn, v_i) return o.to(dtype) diff --git a/examples/deepseek_nsa/regression_example_tilelang_nsa.py b/examples/deepseek_nsa/regression_example_tilelang_nsa.py new file mode 100644 index 000000000..1858f045a --- /dev/null +++ b/examples/deepseek_nsa/regression_example_tilelang_nsa.py @@ -0,0 +1,15 @@ +import tilelang.testing +import example_tilelang_nsa_fwd +import example_tilelang_nsa_decode + + +def regression_example_tilelang_nsa_fwd(): + tilelang.testing.process_func(example_tilelang_nsa_fwd.run_regression_perf) + + +def regression_example_tilelang_nsa_fwd_decode(): + tilelang.testing.process_func(example_tilelang_nsa_decode.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/deepseek_nsa/requirements.txt b/examples/deepseek_nsa/requirements.txt index 777c2ad4c..e096dfd7d 100644 --- a/examples/deepseek_nsa/requirements.txt +++ b/examples/deepseek_nsa/requirements.txt @@ -1 +1 @@ -git+https://github.com/fla-org/flash-linear-attention@c3bd56589033610264532b11f0972c69e4645f6e \ No newline at end of file +git+https://github.com/fla-org/flash-linear-attention@c3bd56589033610264532b11f0972c69e4645f6e diff --git a/examples/deepseek_v32/README.md b/examples/deepseek_v32/README.md index 8457745b0..01a14b6b2 100644 --- a/examples/deepseek_v32/README.md +++ b/examples/deepseek_v32/README.md @@ -121,7 +121,7 @@ for i_i in T.Pipelined(NI, num_stages=num_stages): # ... compute attention over selected tokens ``` -This reduces compute from O(seq_len * seq_len_kv) to O(seq_len * topk). The causal mask is enforced by checking whether each index position is valid: +This reduces compute from O(seq_len *seq_len_kv) to O(seq_len* topk). 
The causal mask is enforced by checking whether each index position is valid: ```python for bi_i in T.Parallel(BI): @@ -193,10 +193,10 @@ for i_i in T.Pipelined(NI, num_stages=num_stages): # Load KV data for selected indices for bi_i, d_i in T.Parallel(BI, D): KV_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz, i_i * BI + bi_i], bz, d_i] - + # Recompute attention scores for backward T.gemm(Q_shared, KV_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) - + # Apply softmax gradient: dP = P * (dP_raw - Delta) for h_i, bi_i in T.Parallel(padded_H, BI): acc_dp[h_i, bi_i] = acc_p[h_i, bi_i] * (acc_dp[h_i, bi_i] - Delta[by, s_i, bz * padded_H + h_i]) * sm_scale @@ -204,7 +204,7 @@ for i_i in T.Pipelined(NI, num_stages=num_stages): The key gradient computations are: - **dQ = dP @ K** (query gradients) -- **dK = dP^T @ Q** (key gradients) +- **dK = dP^T @ Q** (key gradients) - **dV = P^T @ dO** (value gradients) **3. Atomic Sparse Updates**: Uses atomic operations for dKV accumulation: @@ -212,7 +212,7 @@ The key gradient computations are: ```python # Atomically update dKV at selected indices for bi_i, d_i in T.Parallel(BI // split_store, D // 4): - T.atomic_addx4(dKV[by, Indices[by, s_i, bz, i_i * BI + bi_i + s * (BI // split_store)], bz, d_i * 4], + T.atomic_addx4(dKV[by, Indices[by, s_i, bz, i_i * BI + bi_i + s * (BI // split_store)], bz, d_i * 4], acc_dkv_shared[bi_i, d_i * 4]) ``` diff --git a/examples/deepseek_v32/fp8_lighting_indexer.py b/examples/deepseek_v32/fp8_lighting_indexer.py index 21baa8fa8..2f8857597 100644 --- a/examples/deepseek_v32/fp8_lighting_indexer.py +++ b/examples/deepseek_v32/fp8_lighting_indexer.py @@ -28,11 +28,11 @@ def validate_tensor_match(a, b, tolerance=1e-8, tensor_name="tensor", should_rai if should_raise: assert False if not torch.isclose( - a.masked_fill(a_finite, 0), - b.masked_fill(b_finite, 0), - rtol=0, - atol=0, - equal_nan=True, + a.masked_fill(a_finite, 0), + b.masked_fill(b_finite, 0), + rtol=0, + atol=0, + equal_nan=True, ).all(): display_error_message(f"{tensor_name} Error: nonfinite value mismatch") if should_raise: @@ -55,13 +55,10 @@ def get_configs(): threads=[128, 256], block_Q=[1, 2, 4], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] class SupplyProg: - def __init__(self): self.tensors_dict = {} @@ -88,7 +85,8 @@ def supply_prog(self, params): @tilelang.jit( pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - },) + }, +) def mqa_attn_return_logits( heads, index_dim, @@ -99,9 +97,9 @@ def mqa_attn_return_logits( ): if block_Q is None: block_Q = 128 // heads - dtype = "float8_e4m3" - accum_dtype = "float" - index_dtype = "int32" + dtype = T.float8_e4m3fn + accum_dtype = T.float32 + index_dtype = T.int32 seq_len = T.dynamic("seq_len") seq_len_kv = T.dynamic("seq_len_kv") @@ -113,46 +111,42 @@ def mqa_attn_return_logits( @T.prim_func def mqa_attn_return_logits_kernel( - IndexQ: T.Tensor(index_q_shape, dtype), # type: ignore - IndexK: T.Tensor(index_k_shape, dtype), # type: ignore - IndexKScale: T.Tensor(index_k_scale_shape, accum_dtype), # type: ignore - Logits: T.Tensor(logits_shape, accum_dtype), # type: ignore - Weights: T.Tensor([seq_len, heads], accum_dtype), # type: ignore - CuSeqLenKS: T.Tensor([seq_len], index_dtype), # type: ignore - CuSeqLenKE: T.Tensor([seq_len], index_dtype), # type: ignore + IndexQ: T.Tensor(index_q_shape, 
dtype), # type: ignore + IndexK: T.Tensor(index_k_shape, dtype), # type: ignore + IndexKScale: T.Tensor(index_k_scale_shape, accum_dtype), # type: ignore + Logits: T.Tensor(logits_shape, accum_dtype), # type: ignore + Weights: T.Tensor([seq_len, heads], accum_dtype), # type: ignore + CuSeqLenKS: T.Tensor([seq_len], index_dtype), # type: ignore + CuSeqLenKE: T.Tensor([seq_len], index_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_Q), threads=threads) as bx: - index_q_shared = T.alloc_shared([block_Q * heads, index_dim], dtype) index_k_shared = T.alloc_shared([block_N, index_dim], dtype) index_k_scale_fragment = T.alloc_fragment([block_N], accum_dtype) s = T.alloc_fragment([block_N, block_Q * heads], accum_dtype) - s_reshaped = T.alloc_fragment([block_N, block_Q, heads], accum_dtype) + s_reshaped = T.reshape(s, (block_N, block_Q, heads)) logits = T.alloc_fragment([block_N, block_Q], accum_dtype) weights = T.alloc_fragment([block_Q, heads], accum_dtype) seq_len_i = bx * block_Q - cu_k_s_min = T.alloc_local([1], index_dtype) - cu_k_e_max = T.alloc_local([1], index_dtype) + cu_k_s_min = T.alloc_var(index_dtype) + cu_k_e_max = T.alloc_var(index_dtype) - cu_k_s_min[0] = 2147483647 - cu_k_e_max[0] = -2147483648 + cu_k_s_min = 2147483647 + cu_k_e_max = -2147483648 for bq_i in T.serial(block_Q): - cu_k_s_min[0] = T.min(cu_k_s_min[0], T.min(CuSeqLenKS[seq_len_i + bq_i], - seq_len_kv)) + cu_k_s_min = T.min(cu_k_s_min, T.min(CuSeqLenKS[seq_len_i + bq_i], seq_len_kv)) for bq_i in T.serial(block_Q): - cu_k_e_max[0] = T.max(cu_k_e_max[0], T.min(CuSeqLenKE[seq_len_i + bq_i], - seq_len_kv)) + cu_k_e_max = T.max(cu_k_e_max, T.min(CuSeqLenKE[seq_len_i + bq_i], seq_len_kv)) T.copy(IndexQ[seq_len_i * heads, 0], index_q_shared) T.copy(Weights[seq_len_i, 0], weights) - for nbn_i in T.Pipelined( - T.ceildiv(cu_k_e_max[0] - cu_k_s_min[0], block_N), num_stages=num_stages): - T.copy(IndexK[cu_k_s_min[0] + nbn_i * block_N, 0], index_k_shared) - T.copy(IndexKScale[cu_k_s_min[0] + nbn_i * block_N], index_k_scale_fragment) + for nbn_i in T.Pipelined(T.ceildiv(cu_k_e_max - cu_k_s_min, block_N), num_stages=num_stages): + T.copy(IndexK[cu_k_s_min + nbn_i * block_N, 0], index_k_shared) + T.copy(IndexKScale[cu_k_s_min + nbn_i * block_N], index_k_scale_fragment) T.gemm( index_k_shared, @@ -164,15 +158,14 @@ def mqa_attn_return_logits_kernel( ) for bn_i, bq_i, h_i in T.Parallel(block_N, block_Q, heads): - s_reshaped[bn_i, bq_i, - h_i] = (T.max(s[bn_i, bq_i * heads + h_i], 0) * - weights[bq_i, h_i]) * index_k_scale_fragment[bn_i] + s_reshaped[bn_i, bq_i, h_i] = (T.max(s_reshaped[bn_i, bq_i, h_i], 0) * weights[bq_i, h_i]) * index_k_scale_fragment[ + bn_i + ] T.reduce_sum(s_reshaped, logits, dim=-1, clear=True) for bq_i, bn_i in T.Parallel(block_Q, block_N): - Logits[seq_len_i + bq_i, cu_k_s_min[0] + nbn_i * block_N + bn_i] = ( - logits[bn_i, bq_i]) + Logits[seq_len_i + bq_i, cu_k_s_min + nbn_i * block_N + bn_i] = logits[bn_i, bq_i] return mqa_attn_return_logits_kernel @@ -185,38 +178,30 @@ def clean_logits_( seq_len = T.dynamic("seq_len") seq_len_kv = T.dynamic("seq_len_kv") - dtype = "float" - indices_dtype = "int32" + dtype = T.float + indices_dtype = T.int32 @T.prim_func def clean_logits_kernel( - Logits: T.Tensor([seq_len, seq_len_kv], dtype), # type: ignore - CuSeqLenKS: T.Tensor([seq_len], indices_dtype), # type: ignore - CuSeqLenKE: T.Tensor([seq_len], indices_dtype), # type: ignore + Logits: T.Tensor([seq_len, seq_len_kv], dtype), # type: ignore + CuSeqLenKS: T.Tensor([seq_len], indices_dtype), # type: 
ignore + CuSeqLenKE: T.Tensor([seq_len], indices_dtype), # type: ignore ): with T.Kernel(seq_len, threads=threads) as bx: tx = T.thread_binding(0, threads, thread="threadIdx.x") - cu_k_s = T.alloc_local([1], indices_dtype) - cu_k_e = T.alloc_local([1], indices_dtype) - cu_k_s[0] = CuSeqLenKS[bx] - cu_k_e[0] = CuSeqLenKE[bx] + cu_k_s = CuSeqLenKS[bx] + cu_k_e = CuSeqLenKE[bx] for n_i in T.Pipelined(T.ceildiv(seq_len_kv, block_K)): for k_i in T.serial(block_K // threads): idx = n_i * block_K + k_i * threads + tx - if idx < cu_k_s[0] or idx >= cu_k_e[0]: + if idx < cu_k_s or idx >= cu_k_e: Logits[bx, idx] = -T.infinity(dtype) return clean_logits_kernel -def mqa_attn_return_logits_interface(q, - kv, - kv_scales, - weights, - cu_seqlen_ks, - cu_seqlen_ke, - clean_logits=True): +def mqa_attn_return_logits_interface(q, kv, kv_scales, weights, cu_seqlen_ks, cu_seqlen_ke, clean_logits=True): seq_len, heads, index_dim = q.shape seq_len_kv = kv.shape[0] @@ -238,57 +223,48 @@ def mqa_attn_return_logits_interface(q, return logits -def ref_fp8_mqa_logits(q: torch.Tensor, kv: torch.Tensor, weights: torch.Tensor, - cu_seqlen_ks: torch.Tensor, cu_seqlen_ke: torch.Tensor): +def ref_fp8_mqa_logits(q: torch.Tensor, kv: torch.Tensor, weights: torch.Tensor, cu_seqlen_ks: torch.Tensor, cu_seqlen_ke: torch.Tensor): k = kv q = q.float() k = k.float() seq_len_kv = kv.shape[0] - mask_lo = torch.arange(0, seq_len_kv, device='cuda')[None, :] >= cu_seqlen_ks[:, None] - mask_hi = torch.arange(0, seq_len_kv, device='cuda')[None, :] < cu_seqlen_ke[:, None] + mask_lo = torch.arange(0, seq_len_kv, device="cuda")[None, :] >= cu_seqlen_ks[:, None] + mask_hi = torch.arange(0, seq_len_kv, device="cuda")[None, :] < cu_seqlen_ke[:, None] mask = mask_lo & mask_hi - score = torch.einsum('mhd,nd->hmn', q, k) + score = torch.einsum("mhd,nd->hmn", q, k) logits = (score.relu() * weights.unsqueeze(-1).transpose(0, 1)).sum(dim=0) - logits = logits.masked_fill(~mask, float('-inf')) + logits = logits.masked_fill(~mask, float("-inf")) cost = mask.sum() return logits, cost def test_fp8_lighting_indexer(S=4096, SKV=8192, H=32, HKV=1, D=64, kv_stride=1): + # initial random seed to make the performance reproducible + torch.manual_seed(0) q = torch.randn(S, H, D, device="cuda", dtype=torch.bfloat16).to(torch.bfloat16) kv = torch.randn(SKV, D, device="cuda", dtype=torch.bfloat16).to(torch.bfloat16) weights = torch.randn(S, H, device="cuda", dtype=torch.float32) p = (torch.randn(S, SKV, device="cuda", dtype=torch.float32) * 4).softmax(dim=-1) - ks, ke = generate_random_cu_seqlens( - per_cp_seqlen=S, cp_size=4, cp_rank=3, kv_stride=kv_stride, average_q_len=2048) + ks, ke = generate_random_cu_seqlens(per_cp_seqlen=S, cp_size=4, cp_rank=3, kv_stride=kv_stride, average_q_len=2048) - logits_ref, cost_ref = ref_fp8_mqa_logits( - q=q, kv=kv, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) + logits_ref, cost_ref = ref_fp8_mqa_logits(q=q, kv=kv, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) q_fp8 = q.to(torch.float8_e4m3fn) kv_fp8, kv_scales = per_custom_dims_cast_to_fp8(kv, (0,), False) - logits_tl = mqa_attn_return_logits_interface( - q=q_fp8, kv=kv_fp8, kv_scales=kv_scales, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) - diff = validate_tensor_match( - logits_ref, logits_tl, tolerance=1e-14, tensor_name="logits", should_raise=False) + logits_tl = mqa_attn_return_logits_interface(q=q_fp8, kv=kv_fp8, kv_scales=kv_scales, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) + diff = validate_tensor_match(logits_ref, logits_tl, tolerance=1e-14, 
tensor_name="logits", should_raise=False) print(f"diff: {diff}") from tilelang.profiler import do_bench def logits_fn(): - return mqa_attn_return_logits_interface( - q=q_fp8, - kv=kv_fp8, - kv_scales=kv_scales, - weights=weights, - cu_seqlen_ks=ks, - cu_seqlen_ke=ke) + return mqa_attn_return_logits_interface(q=q_fp8, kv=kv_fp8, kv_scales=kv_scales, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof: logits_fn() @@ -302,5 +278,23 @@ def logits_fn(): print(f"cost_ref: {cost_ref}") +def run_regression_perf(S=4096, SKV=8192, H=32, HKV=1, D=64, kv_stride=1): + torch.manual_seed(0) + q = torch.randn(S, H, D, device="cuda", dtype=torch.bfloat16).to(torch.bfloat16) + kv = torch.randn(SKV, D, device="cuda", dtype=torch.bfloat16).to(torch.bfloat16) + weights = torch.randn(S, H, device="cuda", dtype=torch.float32) + ks, ke = generate_random_cu_seqlens(per_cp_seqlen=S, cp_size=4, cp_rank=3, kv_stride=kv_stride, average_q_len=2048) + + q_fp8 = q.to(torch.float8_e4m3fn) + kv_fp8, kv_scales = per_custom_dims_cast_to_fp8(kv, (0,), False) + + from tilelang.profiler import do_bench + + def logits_fn(): + return mqa_attn_return_logits_interface(q=q_fp8, kv=kv_fp8, kv_scales=kv_scales, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) + + return do_bench(logits_fn, backend="cupti") + + if __name__ == "__main__": test_fp8_lighting_indexer() diff --git a/examples/deepseek_v32/inference/README.md b/examples/deepseek_v32/inference/README.md index fe4cc21bb..60afe7ceb 100644 --- a/examples/deepseek_v32/inference/README.md +++ b/examples/deepseek_v32/inference/README.md @@ -11,4 +11,4 @@ Launch the interactive chat interface and start exploring DeepSeek's capabilitie ```bash export CONFIG=config_671B_v3.2.json torchrun --nproc-per-node ${MP} generate.py --ckpt-path ${SAVE_PATH} --config ${CONFIG} --interactive -``` \ No newline at end of file +``` diff --git a/examples/deepseek_v32/inference/config_671B_v3.2.json b/examples/deepseek_v32/inference/config_671B_v3.2.json index be88f1cca..375aa9aa2 100644 --- a/examples/deepseek_v32/inference/config_671B_v3.2.json +++ b/examples/deepseek_v32/inference/config_671B_v3.2.json @@ -23,4 +23,4 @@ "index_n_heads": 64, "index_head_dim": 128, "index_topk": 2048 -} \ No newline at end of file +} diff --git a/examples/deepseek_v32/inference/convert.py b/examples/deepseek_v32/inference/convert.py index df7943918..090be7145 100644 --- a/examples/deepseek_v32/inference/convert.py +++ b/examples/deepseek_v32/inference/convert.py @@ -42,7 +42,7 @@ def main(hf_ckpt_path, save_path, n_experts, mp): save_path (str): Path to the directory where the converted checkpoint files will be saved. n_experts (int): Total number of experts in the model. mp (int): Model parallelism factor. 
- + Returns: None """ diff --git a/examples/deepseek_v32/inference/kernel.py b/examples/deepseek_v32/inference/kernel.py index 262343536..4090d4beb 100644 --- a/examples/deepseek_v32/inference/kernel.py +++ b/examples/deepseek_v32/inference/kernel.py @@ -11,21 +11,21 @@ tilelang.PassConfigKey.TL_DISABLE_FAST_MATH: True, } -FP8 = "float8_e4m3" -BF16 = "bfloat16" -FP32 = "float32" +FP8 = T.float8_e4m3fn +BF16 = T.bfloat16 +FP32 = T.float32 def fast_log2_ceil(x): - bits_x = T.reinterpret("uint32", x) + bits_x = T.reinterpret(x, T.uint32) exp_x = (bits_x >> 23) & 0xFF man_bits = bits_x & ((1 << 23) - 1) - return T.Cast("int32", exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0)) + return T.cast(exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0), T.int32) def fast_pow2(x): bits_x = (x + 127) << 23 - return T.reinterpret("float32", bits_x) + return T.reinterpret(bits_x, T.float32) def fast_round_scale(amax, fp8_max_inv): @@ -107,8 +107,8 @@ def act_quant(x: torch.Tensor, @tilelang.jit(pass_configs=pass_configs) -def fp8_gemm_kernel(N, K, out_dtype=BF16, accum_dtype="float32"): - assert out_dtype in [BF16, "float32"] +def fp8_gemm_kernel(N, K, out_dtype=BF16, accum_dtype=T.float32): + assert out_dtype in [BF16, T.float32] M = T.dynamic("M") group_size = 128 diff --git a/examples/deepseek_v32/inference/requirements.txt b/examples/deepseek_v32/inference/requirements.txt index 604fed552..8c208a8b1 100644 --- a/examples/deepseek_v32/inference/requirements.txt +++ b/examples/deepseek_v32/inference/requirements.txt @@ -2,4 +2,4 @@ torch transformers safetensors fast_hadamard_transform -tilelang==0.1.6 \ No newline at end of file +tilelang==0.1.6 diff --git a/examples/deepseek_v32/regression_tilelang_example_deepseek_v32.py b/examples/deepseek_v32/regression_tilelang_example_deepseek_v32.py new file mode 100644 index 000000000..0610002a6 --- /dev/null +++ b/examples/deepseek_v32/regression_tilelang_example_deepseek_v32.py @@ -0,0 +1,30 @@ +import tilelang.testing +import fp8_lighting_indexer +import sparse_mla_bwd +import sparse_mla_fwd +import sparse_mla_fwd_pipelined +import topk_selector + + +def regression_topk_selector(): + tilelang.testing.process_func(topk_selector.run_regression_perf) + + +def regression_fp8_lighting_indexer(): + tilelang.testing.process_func(fp8_lighting_indexer.run_regression_perf, S=512, SKV=1024, H=32, HKV=1, D=64, kv_stride=1) + + +def regression_sparse_mla_fwd(): + tilelang.testing.process_func(sparse_mla_fwd.run_regression_perf, S=256, SKV=1024, H=64, HKV=1, DQK=576, DV=512, topk=256) + + +def regression_sparse_mla_fwd_pipelined(): + tilelang.testing.process_func(sparse_mla_fwd_pipelined.run_regression_perf, S=256, SKV=512, H=64, HKV=1, DQK=576, DV=512, topk=256) + + +def regression_sparse_mla_bwd(): + tilelang.testing.process_func(sparse_mla_bwd.run_regression_perf, S=256, SKV=512, H=64, HKV=1, DQKV=576, DV=512, topk=256) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/deepseek_v32/sparse_mla_bwd.py b/examples/deepseek_v32/sparse_mla_bwd.py index e7f9c6093..527de22b3 100644 --- a/examples/deepseek_v32/sparse_mla_bwd.py +++ b/examples/deepseek_v32/sparse_mla_bwd.py @@ -13,18 +13,18 @@ def preprocess( D, block_ND=32, num_stages=5, - dtype="bfloat16", - accum_dtype="float", + dtype=T.bfloat16, + accum_dtype=T.float32, ): - assert dtype == "bfloat16" - assert accum_dtype == "float" + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 shape = [B, S, H, D] @T.prim_func def preprocess_kernel( - O: T.Tensor(shape, dtype), - dO: 
T.Tensor(shape, dtype), - Delta: T.Tensor([B, S, H], accum_dtype), + O: T.Tensor(shape, dtype), + dO: T.Tensor(shape, dtype), + Delta: T.Tensor([B, S, H], accum_dtype), ): with T.Kernel(H, T.ceildiv(S, block_ND), B) as (bx, by, bz): o = T.alloc_fragment([block_ND, block_ND], accum_dtype) @@ -33,16 +33,12 @@ def preprocess_kernel( acc = T.alloc_fragment([block_ND, block_ND], accum_dtype) T.clear(acc) for k in T.Pipelined(T.ceildiv(D, block_ND), num_stages=num_stages): - T.copy( - O[bz, by * block_ND:(by + 1) * block_ND, bx, k * block_ND:(k + 1) * block_ND], - o) - T.copy( - dO[bz, by * block_ND:(by + 1) * block_ND, bx, k * block_ND:(k + 1) * block_ND], - do) + T.copy(O[bz, by * block_ND : (by + 1) * block_ND, bx, k * block_ND : (k + 1) * block_ND], o) + T.copy(dO[bz, by * block_ND : (by + 1) * block_ND, bx, k * block_ND : (k + 1) * block_ND], do) for i, j in T.Parallel(block_ND, block_ND): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, by * block_ND:(by + 1) * block_ND, bx]) + T.copy(delta, Delta[bz, by * block_ND : (by + 1) * block_ND, bx]) return preprocess_kernel @@ -56,22 +52,22 @@ def postprocess( kv_group=1, block_N=64, threads=128, - dtype="bfloat16", - accum_dtype="float", + dtype=T.bfloat16, + accum_dtype=T.float32, ): - assert dtype == "bfloat16" - assert accum_dtype == "float" + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 dkv_shape = [B, S_kv, kv_group, D + D_tail] @T.prim_func def postprocess_kernel( - dKV: T.Tensor(dkv_shape, accum_dtype), - dKV_out: T.Tensor(dkv_shape, dtype), + dKV: T.Tensor(dkv_shape, accum_dtype), + dKV_out: T.Tensor(dkv_shape, dtype), ): with T.Kernel(T.ceildiv(S_kv, block_N), kv_group, B, threads=threads) as (bx, by, bz): T.copy( - dKV[bz, bx * block_N:(bx + 1) * block_N, by, :], - dKV_out[bz, bx * block_N:(bx + 1) * block_N, by, :], + dKV[bz, bx * block_N : (bx + 1) * block_N, by, :], + dKV_out[bz, bx * block_N : (bx + 1) * block_N, by, :], ) return postprocess_kernel @@ -82,7 +78,9 @@ def postprocess_kernel( pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + tilelang.PassConfigKey.TL_ENABLE_AGGRESSIVE_SHARED_MEMORY_MERGE: True, + }, +) def bwd( B, S, @@ -97,18 +95,18 @@ def bwd( block_size=32, num_stages=0, threads=256, - indices_dtype="int32", - dtype="bfloat16", - accum_dtype="float", + indices_dtype=T.int32, + dtype=T.bfloat16, + accum_dtype=T.float32, ): - assert is_causal == True, 'non-casual is not supported now' - assert topk % block_size == 0, 'otherwise will load some index=0 thus causing wrong kv to be loaded' - assert dtype == "bfloat16" - assert accum_dtype == "float" - assert indices_dtype == "int32" + assert is_causal == True, "non-casual is not supported now" + assert topk % block_size == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 + assert indices_dtype == T.int32 if sm_scale is None: - sm_scale = (D + D_tail)**(-0.5) + sm_scale = (D + D_tail) ** (-0.5) sm_scale_mul_reciprocal_log2 = sm_scale * 1.44269504 # log2(e) H_kv = H // kv_group @@ -118,12 +116,15 @@ def bwd( indices_shape = [B, S, kv_group, topk] delta_shape = [B, S, H] lse_shape = [B, S, H] - assert indices_dtype == "int32" - assert dtype == "bfloat16" - assert accum_dtype == "float" + assert indices_dtype == T.int32 + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 H = H_kv padded_H = max(tilelang.math.next_power_of_2(H_kv), 16) + block_H = 
min(64, padded_H) + assert padded_H % block_H == 0 + NH = padded_H // block_H BS = block_size NS = tilelang.cdiv(topk, block_size) @@ -131,122 +132,85 @@ def bwd( @T.prim_func def sparse_mla_bwd_kernel( - Q: T.Tensor(q_shape, dtype), - KV: T.Tensor(k_shape, dtype), - dO: T.Tensor(o_shape, dtype), - Indices: T.Tensor(indices_shape, indices_dtype), - Lse: T.Tensor(lse_shape, accum_dtype), - Delta: T.Tensor(delta_shape, accum_dtype), - dQ: T.Tensor(q_shape, dtype), - dKV: T.Tensor(k_shape, accum_dtype), + Q: T.Tensor(q_shape, dtype), + KV: T.Tensor(k_shape, dtype), + dO: T.Tensor(o_shape, dtype), + Indices: T.Tensor(indices_shape, indices_dtype), + Lse: T.Tensor(lse_shape, accum_dtype), + Delta: T.Tensor(delta_shape, accum_dtype), + dQ: T.Tensor(q_shape, dtype), + dKV: T.Tensor(k_shape, accum_dtype), ): - with T.Kernel(S, B, kv_group, threads=threads) as (s_i, by, bz): - Q_shared = T.alloc_shared([padded_H, D], dtype) - Q_tail_shared = T.alloc_shared([padded_H, D_tail], dtype) + with T.Kernel(S, B, kv_group * NH, threads=threads) as (s_i, by, bz): + Q_shared = T.alloc_shared([block_H, D], dtype) + Q_tail_shared = T.alloc_shared([block_H, D_tail], dtype) KV_shared = T.alloc_shared([BS, D], dtype) KV_tail_shared = T.alloc_shared([BS, D_tail], dtype) - dO_shared = T.alloc_shared([padded_H, D], dtype) + dO_shared = T.alloc_shared([block_H, D], dtype) mask = T.alloc_fragment([BS], "bool") - P_shared_cast = T.alloc_shared([padded_H, BS], dtype) - dP_shared_cast = T.alloc_shared([padded_H, BS], dtype) - dQ_shared = T.alloc_shared([padded_H, D], dtype) - dQ_tail_shared = T.alloc_shared([padded_H, D_tail], dtype) + P_shared_cast = T.alloc_shared([block_H, BS], dtype) + dP_shared_cast = T.alloc_shared([block_H, BS], dtype) + dQ_shared = T.alloc_shared([block_H, D], dtype) + dQ_tail_shared = T.alloc_shared([block_H, D_tail], dtype) - acc_p = T.alloc_fragment([padded_H, BS], accum_dtype) - acc_dp = T.alloc_fragment([padded_H, BS], accum_dtype) - acc_dq = T.alloc_fragment([padded_H, D], accum_dtype) - acc_dq_tail = T.alloc_fragment([padded_H, D_tail], accum_dtype) + acc_p = T.alloc_fragment([block_H, BS], accum_dtype) + acc_dp = T.alloc_fragment([block_H, BS], accum_dtype) + acc_dq = T.alloc_fragment([block_H, D], accum_dtype) + acc_dq_tail = T.alloc_fragment([block_H, D_tail], accum_dtype) acc_dkv = T.alloc_fragment([BS, D], accum_dtype) acc_dkv_tail = T.alloc_fragment([BS, D_tail], accum_dtype) - acc_dkv_shared = T.view(KV_shared, shape=[BS // split_store, D], dtype=accum_dtype) - acc_dkv_tail_shared = T.view( - KV_tail_shared, shape=[BS // split_store, D_tail], dtype=accum_dtype) + acc_dkv_shared = T.alloc_shared([BS // split_store, D], accum_dtype) + acc_dkv_tail_shared = T.alloc_shared([BS // split_store, D_tail], accum_dtype) max_kv_i = s_i - T.copy(Q[by, s_i, bz * padded_H:(bz + 1) * padded_H, :D], Q_shared) - T.copy(Q[by, s_i, bz * padded_H:(bz + 1) * padded_H, D:], Q_tail_shared) - T.copy(dO[by, s_i, bz * padded_H:(bz + 1) * padded_H, :D], dO_shared) + T.copy(Q[by, s_i, bz * block_H : (bz + 1) * block_H, :D], Q_shared) + T.copy(Q[by, s_i, bz * block_H : (bz + 1) * block_H, D:], Q_tail_shared) + T.copy(dO[by, s_i, bz * block_H : (bz + 1) * block_H, :D], dO_shared) T.clear(acc_dq) T.clear(acc_dq_tail) - T.annotate_layout({ - dQ_shared: tilelang.layout.make_swizzled_layout(dQ_shared), - dQ_tail_shared: tilelang.layout.make_swizzled_layout(dQ_tail_shared), - }) - # Process each block of indices for i_i in T.Pipelined(NS, num_stages=num_stages): # Check which indices are valid for bi_i in 
T.Parallel(BS): - mask[bi_i] = Indices[by, s_i, bz, i_i * BS + bi_i] <= max_kv_i + mask[bi_i] = Indices[by, s_i, bz // NH, i_i * BS + bi_i] <= max_kv_i # Compute attention scores - for h_i, bi_i in T.Parallel(padded_H, BS): + for h_i, bi_i in T.Parallel(block_H, BS): acc_p[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_p.dtype)) # Load KV, V for this block of indices for bi_i, d_i in T.Parallel(BS, D): - KV_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz, i_i * BS + bi_i], bz, d_i] + KV_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz // NH, i_i * BS + bi_i], bz // NH, d_i] - T.gemm( - Q_shared, KV_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) for bi_i, d_i in T.Parallel(BS, D_tail): - KV_tail_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz, i_i * BS + bi_i], bz, - D + d_i] - T.gemm( - Q_tail_shared, - KV_tail_shared, - acc_p, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) - - for h_i, bi_i in T.Parallel(padded_H, BS): - acc_p[h_i, bi_i] = T.exp2(acc_p[h_i, bi_i] * sm_scale_mul_reciprocal_log2 - - Lse[by, s_i, bz * padded_H + h_i]) + KV_tail_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz // NH, i_i * BS + bi_i], bz // NH, D + d_i] + T.gemm(Q_tail_shared, KV_tail_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + + for h_i, bi_i in T.Parallel(block_H, BS): + acc_p[h_i, bi_i] = T.exp2(acc_p[h_i, bi_i] * sm_scale_mul_reciprocal_log2 - Lse[by, s_i, bz * block_H + h_i]) T.copy(acc_p, P_shared_cast) - T.gemm( - dO_shared, - KV_shared, - acc_dp, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol, - clear_accum=True) + T.gemm(dO_shared, KV_shared, acc_dp, transpose_B=True, policy=T.GemmWarpPolicy.FullCol, clear_accum=True) - for h_i, bi_i in T.Parallel(padded_H, BS): - acc_dp[h_i, bi_i] = acc_p[h_i, bi_i] * ( - acc_dp[h_i, bi_i] - Delta[by, s_i, bz * padded_H + h_i]) * sm_scale + for h_i, bi_i in T.Parallel(block_H, BS): + acc_dp[h_i, bi_i] = acc_p[h_i, bi_i] * (acc_dp[h_i, bi_i] - Delta[by, s_i, bz * block_H + h_i]) * sm_scale T.copy(acc_dp, dP_shared_cast) T.gemm(dP_shared_cast, KV_shared, acc_dq, policy=T.GemmWarpPolicy.FullCol) T.gemm(dP_shared_cast, KV_tail_shared, acc_dq_tail, policy=T.GemmWarpPolicy.FullCol) - T.gemm( - dP_shared_cast, - Q_shared, - acc_dkv, - transpose_A=True, - policy=T.GemmWarpPolicy.FullCol, - clear_accum=True) - T.gemm( - P_shared_cast, - dO_shared, - acc_dkv, - transpose_A=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(dP_shared_cast, Q_shared, acc_dkv, transpose_A=True, policy=T.GemmWarpPolicy.FullCol, clear_accum=True) + T.gemm(P_shared_cast, dO_shared, acc_dkv, transpose_A=True, policy=T.GemmWarpPolicy.FullCol) T.clear(acc_dkv_tail) - T.gemm( - dP_shared_cast, - Q_tail_shared, - acc_dkv_tail, - transpose_A=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(dP_shared_cast, Q_tail_shared, acc_dkv_tail, transpose_A=True, policy=T.GemmWarpPolicy.FullCol) for s in range(split_store): for bi_i, d_i in T.Parallel(BS, D): @@ -255,41 +219,32 @@ def sparse_mla_bwd_kernel( for bi_i, d_i in T.Parallel(BS, D_tail): if bi_i < BS // split_store: - acc_dkv_tail_shared[bi_i, - d_i] = acc_dkv_tail[bi_i + s * (BS // split_store), - d_i] + acc_dkv_tail_shared[bi_i, d_i] = acc_dkv_tail[bi_i + s * (BS // split_store), d_i] for bi_i, d_i in T.Parallel(BS // split_store, D // 4): T.atomic_addx4( - dKV[by, Indices[by, s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], - bz, d_i * 4], acc_dkv_shared[bi_i, d_i * 4]) + dKV[by, Indices[by, s_i, bz 
// NH, i_i * BS + bi_i + s * (BS // split_store)], bz // NH, d_i * 4], + acc_dkv_shared[bi_i, d_i * 4], + ) # Atomically update dKV, dKV_tail tensors for bi_i, d_i in T.Parallel(BS // split_store, D_tail // 4): T.atomic_addx4( - dKV[by, Indices[by, s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], - bz, D + d_i * 4], acc_dkv_tail_shared[bi_i, d_i * 4]) + dKV[by, Indices[by, s_i, bz // NH, i_i * BS + bi_i + s * (BS // split_store)], bz // NH, D + d_i * 4], + acc_dkv_tail_shared[bi_i, d_i * 4], + ) # Store the accumulated dQ T.copy(acc_dq, dQ_shared) T.copy(acc_dq_tail, dQ_tail_shared) - T.copy(dQ_shared, dQ[by, s_i, bz * padded_H:(bz + 1) * padded_H, :D]) - T.copy(dQ_tail_shared, dQ[by, s_i, bz * padded_H:(bz + 1) * padded_H, D:]) + T.copy(dQ_shared, dQ[by, s_i, bz * block_H : (bz + 1) * block_H, :D]) + T.copy(dQ_tail_shared, dQ[by, s_i, bz * block_H : (bz + 1) * block_H, D:]) return sparse_mla_bwd_kernel -def sparse_mla_bwd(q, - kv, - o, - do, - indices, - lse, - sm_scale=None, - is_casual=True, - return_kernel=False, - delta=None): +def sparse_mla_bwd(q, kv, o, do, indices, lse, sm_scale=None, is_casual=True, return_kernel=False, delta=None): assert q.is_contiguous() assert kv.is_contiguous() assert indices.is_contiguous() @@ -322,6 +277,7 @@ def sparse_mla_bwd(q, def ref_sparse_mla_bwd_interface(q, kv, o, do, indices, lse, sm_scale=None, is_casual=True): from sparse_mla_fwd import ref_sparse_mla_fwd_interface + q = q.detach().clone() kv = kv.detach().clone() q.requires_grad = True @@ -331,30 +287,22 @@ def ref_sparse_mla_bwd_interface(q, kv, o, do, indices, lse, sm_scale=None, is_c return q.grad, kv.grad -def test_sparse_mla_bwd(B=1, - S=4096, - SKV=8192, - H=64, - HKV=1, - DQKV=576, - DV=512, - topk=2048, - dtype=torch.bfloat16, - check_correctness=True): +def test_sparse_mla_bwd(B=1, S=4096, SKV=8192, H=64, HKV=1, DQKV=576, DV=512, topk=2048, dtype=torch.bfloat16, check_correctness=True): # Prepare data - q = torch.randn((B, S, H, DQKV), dtype=dtype, device='cuda').requires_grad_(True) - kv = torch.randn((B, SKV, HKV, DQKV), dtype=dtype, device='cuda').requires_grad_(True) - do = torch.randn((B, S, H, DV), dtype=dtype, device='cuda') + q = torch.randn((B, S, H, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + kv = torch.randn((B, SKV, HKV, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((B, S, H, DV), dtype=dtype, device="cuda") - indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device='cuda') + indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device="cuda") for b in range(B): for t in range(S): for h in range(HKV): i_i = torch.randperm(max(1, t))[:topk] - indices[b, t, h, :len(i_i)] = i_i + indices[b, t, h, : len(i_i)] = i_i # Forward from sparse_mla_fwd import sparse_mla_fwd_interface + tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices) tl_dq, tl_dkv = sparse_mla_bwd(q, kv, tl_out, do, indices, tl_lse) @@ -365,13 +313,15 @@ def test_sparse_mla_bwd(B=1, assert_tensors_similar(tl_dkv, ref_dkv, eps=1e-4, name="dkv") print("assert_tensors_similar passed") - per_token_flop = 2 * sum([ - H * DV * topk, - H * DQKV * topk, - H * DQKV * topk, - H * DQKV * topk, - H * DV * topk, - ]) + per_token_flop = 2 * sum( + [ + H * DV * topk, + H * DQKV * topk, + H * DQKV * topk, + H * DQKV * topk, + H * DV * topk, + ] + ) from tilelang.profiler import do_bench def fn(): @@ -379,20 +329,44 @@ def fn(): ms = do_bench(fn, rep=100, warmup=250) print(f"Average time: {ms:.3f} ms") - print(f'bwd io bandwidth = ', - (B * S * max(DQKV * 
2, DQKV + DV) * topk * 2) / (ms * 1e-3) / 1e12) - print(f'bwd tflops = ', per_token_flop * S / (ms * 1e-3) / 1e12) + print(f"bwd io bandwidth = ", (B * S * max(DQKV * 2, DQKV + DV) * topk * 2) / (ms * 1e-3) / 1e12) + print(f"bwd tflops = ", per_token_flop * S / (ms * 1e-3) / 1e12) + + +def run_regression_perf(B=1, S=4096, SKV=8192, H=64, HKV=1, DQKV=576, DV=512, topk=2048, dtype=torch.bfloat16): + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + q = torch.randn((B, S, H, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + kv = torch.randn((B, SKV, HKV, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((B, S, H, DV), dtype=dtype, device="cuda") + + indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device="cuda") + for b in range(B): + for t in range(S): + for h in range(HKV): + i_i = torch.randperm(max(1, t))[:topk] + indices[b, t, h, : len(i_i)] = i_i + + from sparse_mla_fwd import sparse_mla_fwd_interface + + tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices) + B, S, H, dim_plus_tail_dim = q.shape + _, S_kv, kv_group, _ = kv.shape + D = 512 + D_tail = dim_plus_tail_dim - D + topk = indices.shape[-1] + preprocess_kernel = preprocess(B, S, H, D) + bwd_kernel = bwd(B, S, S_kv, H, D, D_tail, topk, kv_group, None, True) + delta = preprocess_kernel(tl_out, do) + dkv = torch.zeros_like(kv, dtype=torch.float32) + + from tilelang.profiler import do_bench + + def run_kernel_only(): + return bwd_kernel(q, kv, do, indices, tl_lse, delta, dkv) + + return do_bench(run_kernel_only, backend="cupti") if __name__ == "__main__": - test_sparse_mla_bwd( - B=1, - S=4096, - SKV=8192, - H=64, - HKV=1, - DQKV=576, - DV=512, - topk=2048, - dtype=torch.bfloat16, - check_correctness=True) + test_sparse_mla_bwd(B=1, S=4096, SKV=8192, H=64, HKV=1, DQKV=576, DV=512, topk=2048, dtype=torch.bfloat16, check_correctness=True) diff --git a/examples/deepseek_v32/sparse_mla_fwd.py b/examples/deepseek_v32/sparse_mla_fwd.py index a39c72c40..ddde11f5b 100644 --- a/examples/deepseek_v32/sparse_mla_fwd.py +++ b/examples/deepseek_v32/sparse_mla_fwd.py @@ -25,15 +25,12 @@ def sparse_mla_fwd( num_stages=2, threads=256, ): - assert dim == tilelang.math.next_power_of_2( - dim), f"haven't check padding correctness yet, dim={dim}" - assert tail_dim == tilelang.math.next_power_of_2( - tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" + assert dim == tilelang.math.next_power_of_2(dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2(tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" assert is_causal == True, "non-casual is not supported" - assert (topk % - block_I == 0), "otherwise will load some index=0 thus causing wrong kv to be loaded" + assert topk % block_I == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" if sm_scale is None: - sm_scale = (1.0 / (dim + tail_dim))**0.5 * 1.44269504 # log2(e) + sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 * 1.44269504 # log2(e) else: sm_scale = sm_scale * 1.44269504 # log2(e) @@ -47,17 +44,17 @@ def sparse_mla_fwd( o_shape = [batch, seq_len, heads, dim] indices_shape = [batch, seq_len, kv_group, topk] lse_shape = [batch, seq_len, heads] - indices_dtype = "int32" - dtype = "bfloat16" - accum_dtype = "float" + indices_dtype = T.int32 + dtype = T.bfloat16 + accum_dtype = T.float32 G = kv_group H = head_kv padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) if padded_H != H: - assert ( - kv_group == 1 - ), "here we solve the H 
padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + assert kv_group == 1, ( + "here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + ) BI = block_I NI = tilelang.cdiv(topk, block_I) D = dim @@ -73,18 +70,17 @@ def sparse_mla_fwd( @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), # type: ignore - KV: T.Tensor(kv_shape, dtype), # type: ignore - Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore - Output: T.Tensor(o_shape, dtype), # type: ignore - Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + Output: T.Tensor(o_shape, dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore ): - with T.Kernel( - seq_len * REPLICATE_H, batch, kv_group, threads=threads) as ( - bx, - by, - bz, - ): + with T.Kernel(seq_len * REPLICATE_H, batch, kv_group, threads=threads) as ( + bx, + by, + bz, + ): Q_shared = T.alloc_shared([H_per_block, D], dtype) Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) KV_shared = T.alloc_shared([BI, D], dtype) @@ -118,16 +114,13 @@ def main( T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared) for i_i in T.Pipelined(NI, num_stages=num_stages): - for bi_i in T.Parallel(BI): mask[bi_i] = Indices[b_i, s_i, g_i, i_i * BI + bi_i] <= max_kv_i for bi_i, d_i in T.Parallel(BI, D): - KV_shared[bi_i, d_i] = KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, - d_i] + KV_shared[bi_i, d_i] = KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, d_i] for bi_i, d_i in T.Parallel(BI, D_tail): - K_tail_shared[bi_i, d_i] = KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, - D + d_i] + K_tail_shared[bi_i, d_i] = KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, D + d_i] for h_i, bi_i in T.Parallel(H_per_block, BI): acc_s[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_s.dtype)) @@ -147,6 +140,8 @@ def main( ) T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(H_per_block): alpha[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(H_per_block, BI): @@ -166,23 +161,13 @@ def main( for h_i in T.Parallel(H_per_block): sumexp[h_i] = T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale - T.copy(acc_o, O_shared) T.copy(acc_o, Output[b_i, s_i, H0:H1, :]) - T.copy(sumexp, Lse_shared) T.copy(sumexp, Lse[b_i, s_i, H0:H1]) return main -def sparse_mla_fwd_interface(q, - kv, - indices, - sm_scale=None, - return_p_sum: bool = False, - d_v=512, - block_I=64, - num_stages=2, - threads=256): +def sparse_mla_fwd_interface(q, kv, indices, sm_scale=None, return_p_sum: bool = False, d_v=512, block_I=64, num_stages=2, threads=256): is_casual = True assert return_p_sum == False, "This kernel file is for fwd only" assert q.is_contiguous() and kv.is_contiguous() and indices.is_contiguous() @@ -199,16 +184,8 @@ def sparse_mla_fwd_interface(q, assert indices.shape == (batch, seq_len, kv_group, topk) kernel = sparse_mla_fwd( - heads, - dim, - tail_dim, - topk, - kv_group, - sm_scale, - is_casual, - block_I=block_I, - num_stages=num_stages, - threads=threads) + heads, dim, tail_dim, topk, 
kv_group, sm_scale, is_casual, block_I=block_I, num_stages=num_stages, threads=threads + ) out, lse = kernel(q, kv, indices) return out, lse @@ -228,14 +205,14 @@ def ref_sparse_mla_fwd_interface(q, kv, indices, sm_scale=None, is_casual=True): b, _, _, dim_v = v.shape g_index = g h_index = h // g - compressed_casual_mask = torch.arange( - 0, sq, dtype=torch.int32, device="cuda").view(-1, 1) >= torch.arange( - 1 - 1, sk * 1, 1, dtype=torch.int32, device="cuda").view(1, -1) + compressed_casual_mask = torch.arange(0, sq, dtype=torch.int32, device="cuda").view(-1, 1) >= torch.arange( + 1 - 1, sk * 1, 1, dtype=torch.int32, device="cuda" + ).view(1, -1) mask = q.new_zeros(b, g_index, sq, sk + 1, dtype=torch.bool).scatter(3, indices.long(), 1) mask = mask[..., :-1] mask = mask & compressed_casual_mask.view(1, 1, sq, sk) - mask[:, :, :1 - 1, 0] = True + mask[:, :, : 1 - 1, 0] = True mask = mask.view(b, g_index, 1, sq, sk) q = q.view(b, sq, g, -1, dim_q) @@ -250,19 +227,21 @@ def ref_sparse_mla_fwd_interface(q, kv, indices, sm_scale=None, is_casual=True): return o.to(torch.bfloat16) -def test_sparse_mla_fwd(B=1, - S=4096, - SKV=8192, - H=128, - HKV=1, - DQK=576, - DV=512, - topk=2048, - dtype=torch.bfloat16, - check_correctness=True, - block_I=64, - num_stages=2, - threads=256): +def test_sparse_mla_fwd( + B=1, + S=4096, + SKV=8192, + H=128, + HKV=1, + DQK=576, + DV=512, + topk=2048, + dtype=torch.bfloat16, + check_correctness=True, + block_I=64, + num_stages=2, + threads=256, +): torch.random.manual_seed(0) q = torch.randn((B, S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True) @@ -272,10 +251,9 @@ def test_sparse_mla_fwd(B=1, for t in range(S): for h in range(HKV): i_i = torch.randperm(max(1, t))[:topk] - indices[b, t, h, :len(i_i)] = i_i + indices[b, t, h, : len(i_i)] = i_i - tl_out, tl_lse = sparse_mla_fwd_interface( - q, kv, indices, block_I=block_I, num_stages=num_stages, threads=threads) + tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices, block_I=block_I, num_stages=num_stages, threads=threads) if check_correctness: # otherwise may cause out of memory @@ -284,8 +262,7 @@ def test_sparse_mla_fwd(B=1, print("assert_tensors_similar passed") def fn(): - return sparse_mla_fwd_interface( - q, kv, indices, block_I=block_I, num_stages=num_stages, threads=threads) + return sparse_mla_fwd_interface(q, kv, indices, block_I=block_I, num_stages=num_stages, threads=threads) from tilelang.profiler import do_bench @@ -299,6 +276,36 @@ def fn(): print("fwd tflops = ", (B * S * (DQK + DV) * topk * 2 * H) / (ms * 1e-3) / 1e12) +def run_regression_perf( + B=1, S=4096, SKV=8192, H=128, HKV=1, DQK=576, DV=512, topk=2048, dtype=torch.bfloat16, block_I=64, num_stages=2, threads=256 +): + torch.random.manual_seed(0) + q = torch.randn((B, S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) + kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True) + + indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device="cuda") + for b in range(B): + for t in range(S): + for h in range(HKV): + i_i = torch.randperm(max(1, t))[:topk] + indices[b, t, h, : len(i_i)] = i_i + + is_casual = True + _, _, heads, dim_plus_tail_dim = q.shape + _, _, kv_group, _ = kv.shape + dim = 512 + tail_dim = dim_plus_tail_dim - dim + _, _, _, topk = indices.shape + kernel = sparse_mla_fwd(heads, dim, tail_dim, topk, kv_group, None, is_casual, block_I=block_I, num_stages=num_stages, threads=threads) + 
+ def run_kernel_only(): + kernel(q, kv, indices) + + from tilelang.profiler import do_bench + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": test_sparse_mla_fwd( B=1, @@ -313,4 +320,5 @@ def fn(): check_correctness=True, block_I=64, num_stages=2, - threads=256) + threads=256, + ) diff --git a/examples/deepseek_v32/sparse_mla_fwd_pipelined.py b/examples/deepseek_v32/sparse_mla_fwd_pipelined.py index 96dda7df5..7e664d11b 100644 --- a/examples/deepseek_v32/sparse_mla_fwd_pipelined.py +++ b/examples/deepseek_v32/sparse_mla_fwd_pipelined.py @@ -9,10 +9,16 @@ @tilelang.jit( out_idx=[-2, -1], compile_flags=[ - "-O3", "-Wno-deprecated-declarations", "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__", "-U__CUDA_NO_HALF2_OPERATORS__", - "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", "--expt-relaxed-constexpr", "--expt-extended-lambda", - "--ptxas-options=-v,--register-usage-level=10", "-DNDEBUG" + "-O3", + "-Wno-deprecated-declarations", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "--ptxas-options=-v,--register-usage-level=10", + "-DNDEBUG", ], ) def sparse_mla_fwd( @@ -32,14 +38,12 @@ def sparse_mla_fwd( num_stages=0, threads=384, ): - assert dim == tilelang.math.next_power_of_2( - dim), f"haven't check padding correctness yet, dim={dim}" - assert tail_dim == tilelang.math.next_power_of_2( - tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" - assert is_causal == True, 'non-casual is not supported' - assert topk % block_I == 0, 'otherwise will load some index=0 thus causing wrong kv to be loaded' + assert dim == tilelang.math.next_power_of_2(dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2(tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" + assert is_causal == True, "non-casual is not supported" + assert topk % block_I == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" if sm_scale is None: - sm_scale = (1.0 / (dim + tail_dim))**0.5 * 1.44269504 # log2(e) + sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 * 1.44269504 # log2(e) else: sm_scale = sm_scale * 1.44269504 # log2(e) @@ -49,23 +53,25 @@ def sparse_mla_fwd( o_shape = [batch, seq_len, heads, dim] indices_shape = [batch, seq_len, kv_group, topk] lse_shape = [batch, seq_len, heads] - indices_dtype = "int32" - dtype = "bfloat16" - accum_dtype = "float" + indices_dtype = T.int32 + dtype = T.bfloat16 + accum_dtype = T.float32 G = kv_group H = head_kv padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) if padded_H != H: - assert kv_group == 1, 'here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)' + assert kv_group == 1, ( + "here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + ) BI = block_I NI = tilelang.cdiv(topk, block_I) - assert NI % 2 == 0, 'NI should be a multiple of 2' + assert NI % 2 == 0, "NI should be a multiple of 2" D = dim D_tail = tail_dim KV_stride = kv_stride if head_kv > 64: - assert head_kv % 64 == 0, 'head_kv should be a multiple of 64' + assert head_kv % 64 == 0, "head_kv should be a multiple of 64" REPLICATE_H = 
head_kv // 64 else: REPLICATE_H = 1 @@ -74,18 +80,14 @@ def sparse_mla_fwd( @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), # type: ignore - KV: T.Tensor(kv_shape, dtype), # type: ignore - Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore - q_start_index_s: T.Tensor(1, indices_dtype), - Output: T.Tensor(o_shape, dtype), # type: ignore - Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + q_start_index_s: T.Tensor(1, indices_dtype), + Output: T.Tensor(o_shape, dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore ): - with T.Kernel( - (seq_len - kv_stride + 1 if CP0 else seq_len) * REPLICATE_H, - batch, - kv_group, - threads=threads) as (bx, by, bz): + with T.Kernel((seq_len - kv_stride + 1 if CP0 else seq_len) * REPLICATE_H, batch, kv_group, threads=threads) as (bx, by, bz): Q_shared_l = T.alloc_shared([H_per_block, D // 2], dtype) Q_shared_r = T.alloc_shared([H_per_block, D // 2], dtype) Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) @@ -110,7 +112,7 @@ def main( alpha_local = T.alloc_fragment([H_per_block], accum_dtype) m_i = T.alloc_fragment([H_per_block], accum_dtype) m_i_prev = T.alloc_fragment([H_per_block], accum_dtype) - indices_local = T.alloc_local([1], indices_dtype) + indices_local = T.alloc_var(indices_dtype) # TODO: Multi buffer bar_q = T.alloc_barrier(arrive_count=384) @@ -122,8 +124,7 @@ def main( bar_sScale_and_sS_free = T.alloc_barrier(arrive_count=256) b_i, g_i = by, bz - s_i = (bx + (KV_stride - 1 if CP0 else 0)) if REPLICATE_H == 1 else ( - bx // REPLICATE_H + (KV_stride - 1 if CP0 else 0)) + s_i = (bx + (KV_stride - 1 if CP0 else 0)) if REPLICATE_H == 1 else (bx // REPLICATE_H + (KV_stride - 1 if CP0 else 0)) q_i = q_start_index_s[0] + s_i max_kv_i = (q_i + 1 - KV_stride) // KV_stride @@ -132,26 +133,24 @@ def main( tx = T.get_thread_binding() - T.copy(Q[b_i, s_i, H0:H1, 0:D // 2], Q_shared_l) - T.copy(Q[b_i, s_i, H0:H1, D // 2:D], Q_shared_r) + T.copy(Q[b_i, s_i, H0:H1, 0 : D // 2], Q_shared_l) + T.copy(Q[b_i, s_i, H0:H1, D // 2 : D], Q_shared_r) T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared) T.barrier_arrive(bar_q) if tx < 128: T.set_max_nreg(240, 1) T.fill(sumexp, 0) - T.fill(m_i, -2**30) # avoid -inf - inf to cause nan + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan T.fill(acc_o_l, 0) T.barrier_wait(bar_q, 0) for i_i in T.serial(T.ceildiv(NI, 2)): - # Buffer 0 T.barrier_wait(bar_k_0_ready[0], (i_i & 1)) for h_i, bi_i in T.Parallel(H_per_block, BI): - acc_s[h_i, bi_i] = T.if_then_else(is_kv_valid[bi_i], 0, - -T.infinity(acc_s.dtype)) + acc_s[h_i, bi_i] = T.if_then_else(is_kv_valid[bi_i], 0, -T.infinity(acc_s.dtype)) T.gemm(Q_shared_l, KV_shared_0_l, acc_s, transpose_B=True, wg_wait=-1) T.gemm(Q_shared_r, KV_shared_0_r, acc_s, transpose_B=True, wg_wait=-1) T.gemm(Q_tail_shared, K_tail_shared_0, acc_s, transpose_B=True, wg_wait=-1) @@ -164,6 +163,8 @@ def main( T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(H_per_block): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(H_per_block, BI): @@ -185,8 +186,7 @@ def main( T.barrier_wait(bar_k_1_ready[0], (i_i & 1)) for h_i, bi_i in T.Parallel(H_per_block, BI): - acc_s[h_i, bi_i] = T.if_then_else(is_kv_valid[bi_i], 0, - 
-T.infinity(acc_s.dtype)) + acc_s[h_i, bi_i] = T.if_then_else(is_kv_valid[bi_i], 0, -T.infinity(acc_s.dtype)) T.gemm(Q_shared_l, KV_shared_1_l, acc_s, transpose_B=True, wg_wait=-1) T.gemm(Q_shared_r, KV_shared_1_r, acc_s, transpose_B=True, wg_wait=-1) T.gemm(Q_tail_shared, K_tail_shared_1, acc_s, transpose_B=True, wg_wait=-1) @@ -198,6 +198,8 @@ def main( T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(H_per_block): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(H_per_block, BI): @@ -223,7 +225,7 @@ def main( for h_i in T.Parallel(H_per_block): sumexp[h_i] = T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale T.copy(acc_o_l, O_shared_l) - T.copy(O_shared_l, Output[b_i, s_i, H0:H1, 0:D // 2]) + T.copy(O_shared_l, Output[b_i, s_i, H0:H1, 0 : D // 2]) elif tx >= 128 and tx < 256: T.set_max_nreg(168, 1) @@ -253,7 +255,7 @@ def main( acc_o_r[h_i, d_i] /= sum_exp_shared[h_i] T.copy(acc_o_r, O_shared_r) - T.copy(O_shared_r, Output[b_i, s_i, H0:H1, D // 2:D]) + T.copy(O_shared_r, Output[b_i, s_i, H0:H1, D // 2 : D]) elif tx >= 256: # producer T.set_max_nreg(80, 0) @@ -261,70 +263,58 @@ def main( # Buffer 0 T.barrier_wait(bar_k_0_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - indices_local[0] = Indices[b_i, s_i, g_i, - (i_i * 2) * BI + r * 16 + (tx - 256) // 8] - is_kv_valid[r * 16 + (tx - 256) // 8] = indices_local[0] <= max_kv_i + indices_local = Indices[b_i, s_i, g_i, (i_i * 2) * BI + r * 16 + (tx - 256) // 8] + is_kv_valid[r * 16 + (tx - 256) // 8] = indices_local <= max_kv_i if is_kv_valid[r * 16 + (tx - 256) // 8]: with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_0_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_0_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, D // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local, g_i, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local, g_i, D // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, - D + (tx - 256) % 8 * 8 + v] + K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local, g_i, D + (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_0_ready[0]) # Buffer 1 T.barrier_wait(bar_k_1_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - indices_local[0] = Indices[b_i, s_i, g_i, - (i_i * 2 + 1) * BI + r * 16 + (tx - 256) // 8] - is_kv_valid[r * 16 + (tx - 256) // 8] = indices_local[0] <= max_kv_i + indices_local = Indices[b_i, s_i, g_i, (i_i * 2 + 1) * BI + r * 16 + (tx - 256) // 8] + is_kv_valid[r * 16 + (tx - 256) // 8] = indices_local <= max_kv_i if is_kv_valid[r * 16 + (tx - 256) // 8]: with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_1_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_1_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], 
g_i, D // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local, g_i, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local, g_i, D // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, - D + (tx - 256) % 8 * 8 + v] + K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local, g_i, D + (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_1_ready[0]) return main -def sparse_mla_fwd_interface(q, - kv, - indices, - q_start_index_s, - kv_stride, - sm_scale=None, - is_casual=True, - return_kernel=False, - print_kernel=False): +def sparse_mla_fwd_interface( + q, kv, indices, q_start_index_s, kv_stride, sm_scale=None, is_casual=True, return_kernel=False, print_kernel=False +): assert q.is_contiguous() and kv.is_contiguous() and indices.is_contiguous() batch, seq_len, heads, dim_plus_tail_dim = q.shape _, seq_len_kv, kv_group, _ = kv.shape - assert dim_plus_tail_dim == 576, 'you should assign dim otherwise' + assert dim_plus_tail_dim == 576, "you should assign dim otherwise" dim = 512 assert kv.shape[-1] == dim_plus_tail_dim @@ -334,29 +324,23 @@ def sparse_mla_fwd_interface(q, assert indices.shape == (batch, seq_len, kv_group, topk) if q_start_index_s != 0: - assert q_start_index_s > kv_stride, "If it is because each cp has too short length, you should fix the logic involving CP0 (cp_rank == 0), to make sure q with pos < KV_Stride - 1 is masked (or you may just ignore how this is handled if nan in these q's Out would not effect others, which is reported to be likely to happen by wangding)" + assert q_start_index_s > kv_stride, ( + "If it is because each cp has too short length, you should fix the logic involving CP0 (cp_rank == 0), to make sure q with pos < KV_Stride - 1 is masked (or you may just ignore how this is handled if nan in these q's Out would not effect others, which is reported to be likely to happen by wangding)" + ) CP0 = q_start_index_s == 0 - kernel = sparse_mla_fwd(batch, seq_len, seq_len_kv, heads, dim, tail_dim, topk, kv_stride, - kv_group, sm_scale, is_casual, CP0) + kernel = sparse_mla_fwd(batch, seq_len, seq_len_kv, heads, dim, tail_dim, topk, kv_stride, kv_group, sm_scale, is_casual, CP0) if print_kernel: print(kernel.get_kernel_source()) - out, lse = kernel(q, kv, indices, - torch.tensor([q_start_index_s], dtype=torch.int32, device="cuda")) + out, lse = kernel(q, kv, indices, torch.tensor([q_start_index_s], dtype=torch.int32, device="cuda")) if return_kernel: return kernel if q_start_index_s == 0 and kv_stride > 1: - out[:, :kv_stride - 1, :, :] = 0 + out[:, : kv_stride - 1, :, :] = 0 return out, lse -def ref_sparse_mla_fwd_interface(q, - kv, - indices, - q_start_index_s, - kv_stride=4, - sm_scale=None, - is_casual=True): +def ref_sparse_mla_fwd_interface(q, kv, indices, q_start_index_s, kv_stride=4, sm_scale=None, is_casual=True): q = q.float() kv = kv.float() indices = indices.transpose(1, 2) @@ -365,7 +349,7 @@ def ref_sparse_mla_fwd_interface(q, if q_start_index_s is None: q_start_index_s = sk * kv_stride - sq - assert kv.shape[-1] == 576, 'you should assign dim otherwise' + assert kv.shape[-1] == 576, "you should assign dim otherwise" dim = 512 k = kv v = kv[..., :dim] @@ -374,15 +358,14 @@ def 
ref_sparse_mla_fwd_interface(q, num_kv_per_index = 1 g_index = g h_index = h // g - compressed_casual_mask = torch.arange( - q_start_index_s, sq + q_start_index_s, dtype=torch.int32, - device="cuda").view(-1, 1) >= torch.arange( - kv_stride - 1, sk * kv_stride, kv_stride, dtype=torch.int32, device="cuda").view(1, -1) + compressed_casual_mask = torch.arange(q_start_index_s, sq + q_start_index_s, dtype=torch.int32, device="cuda").view( + -1, 1 + ) >= torch.arange(kv_stride - 1, sk * kv_stride, kv_stride, dtype=torch.int32, device="cuda").view(1, -1) mask = q.new_zeros(b, g_index, sq, sk + 1, dtype=torch.bool).scatter(3, indices.long(), 1) mask = mask[..., :-1] mask = mask & compressed_casual_mask.view(1, 1, sq, sk) - mask[:, :, :kv_stride - 1, 0] = True + mask[:, :, : kv_stride - 1, 0] = True mask = mask.view(b, g_index, 1, sq, sk) q = q.view(b, sq, g, -1, dim_q) @@ -397,41 +380,32 @@ def ref_sparse_mla_fwd_interface(q, return o.to(torch.bfloat16) -def test_sparse_mla_fwd_pipelined(B=1, - S=4096, - SKV=8192, - H=128, - HKV=1, - DQK=576, - DV=512, - topk=2048, - dtype=torch.bfloat16, - q_start_s_index=1024, - check_correctness=True): +def test_sparse_mla_fwd_pipelined( + B=1, S=4096, SKV=8192, H=128, HKV=1, DQK=576, DV=512, topk=2048, dtype=torch.bfloat16, q_start_s_index=1024, check_correctness=True +): KV_stride = 1 torch.random.manual_seed(0) - q = torch.randn((B, S, H, DQK), dtype=dtype, device='cuda').requires_grad_(True) / 10 - kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device='cuda').requires_grad_(True) / 10 + q = torch.randn((B, S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) / 10 + kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True) / 10 q_start_s_index_t = torch.tensor([q_start_s_index], dtype=torch.int32, device="cuda") q.clamp_(-10, 10) kv.clamp_(-10, 10) - indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device='cuda') + indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device="cuda") for b in range(B): for t in range(S): for h in range(HKV): i_i = torch.randperm(min(max(1, ((t + q_start_s_index) // KV_stride)), SKV))[:topk] - indices[b, t, h, :len(i_i)] = i_i + indices[b, t, h, : len(i_i)] = i_i - kernel = sparse_mla_fwd_interface( - q, kv, indices, q_start_s_index, KV_stride, return_kernel=True, print_kernel=True) + kernel = sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride, return_kernel=True, print_kernel=True) def fn(): out, lse = kernel(q, kv, indices, q_start_s_index_t) if q_start_s_index == 0 and KV_stride > 1: - out[:, :KV_stride - 1, :, :] = 0 + out[:, : KV_stride - 1, :, :] = 0 return out, lse tl_out, tl_lse = fn() @@ -442,14 +416,46 @@ def fn(): torch.testing.assert_close(tl_out, ref_out, rtol=1e-3, atol=1e-3) from tilelang.profiler import do_bench + ms = do_bench( fn, rep=10, warmup=10, ) print(f"Average time: {ms:.3f} ms") - print(f'fwd io bandwidth = ', (B * S * DQK * topk * 2) / (ms * 1e-3) / 1e12) - print(f'fwd tflops = ', (B * S * (DQK + DV) * topk * 2 * H) / (ms * 1e-3) / 1e12) + print(f"fwd io bandwidth = ", (B * S * DQK * topk * 2) / (ms * 1e-3) / 1e12) + print(f"fwd tflops = ", (B * S * (DQK + DV) * topk * 2 * H) / (ms * 1e-3) / 1e12) + + +def run_regression_perf(B=1, S=4096, SKV=8192, H=128, HKV=1, DQK=576, DV=512, topk=2048, dtype=torch.bfloat16, q_start_s_index=1024): + KV_stride = 1 + + torch.random.manual_seed(0) + q = torch.randn((B, S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) / 10 + kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, 
device="cuda").requires_grad_(True) / 10 + q.clamp_(-10, 10) + kv.clamp_(-10, 10) + + indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device="cuda") + for b in range(B): + for t in range(S): + for h in range(HKV): + i_i = torch.randperm(min(max(1, ((t + q_start_s_index) // KV_stride)), SKV))[:topk] + indices[b, t, h, : len(i_i)] = i_i + + batch, seq_len, heads, dim_plus_tail_dim = q.shape + _, seq_len_kv, kv_group, _ = kv.shape + dim = 512 + tail_dim = dim_plus_tail_dim - dim + CP0 = q_start_s_index == 0 + kernel = sparse_mla_fwd(batch, seq_len, seq_len_kv, heads, dim, tail_dim, topk, KV_stride, kv_group, None, True, CP0) + + def run_kernel_only(): + kernel(q, kv, indices, torch.tensor([q_start_s_index], dtype=torch.int32, device="cuda")) + + from tilelang.profiler import do_bench + + return do_bench(run_kernel_only, backend="cupti") if __name__ == "__main__": @@ -460,5 +466,4 @@ def fn(): B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 1024, 8192, 128, 1, 576, 512, 2048, torch.bfloat16 else: B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 4096, 8192, 128, 1, 576, 512, 2048, torch.bfloat16 - test_sparse_mla_fwd_pipelined( - B, S, SKV, H, HKV, DQK, DV, topk, dtype, check_correctness=args.test_correctness) + test_sparse_mla_fwd_pipelined(B, S, SKV, H, HKV, DQK, DV, topk, dtype, check_correctness=args.test_correctness) diff --git a/examples/deepseek_v32/sparse_mla_fwd_seesaw.py b/examples/deepseek_v32/sparse_mla_fwd_seesaw.py new file mode 100644 index 000000000..5d155f851 --- /dev/null +++ b/examples/deepseek_v32/sparse_mla_fwd_seesaw.py @@ -0,0 +1,644 @@ +# ruff: noqa +import torch +import tilelang +from tilelang import language as T +import argparse + + +@tilelang.jit( + out_idx=[-2, -1], + compile_flags=[ + "-O3", + "--ptxas-options=-v,--register-usage-level=10", + "-DNDEBUG", + "-Wno-deprecated-declarations", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + ], +) +def sparse_mla_fwd( + batch, + seq_len, + seq_len_kv, + heads, + dim, + tail_dim, + topk, + kv_stride, + kv_group=1, + sm_scale=None, + is_causal=True, + CP0=True, + block_I=64, + num_stages=0, + threads=384, +): + assert dim == tilelang.math.next_power_of_2(dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2(tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" + assert is_causal == True, "non-casual is not supported" + assert topk % block_I == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" + if sm_scale is None: + sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 * 1.44269504 # log2(e) + else: + sm_scale = sm_scale * 1.44269504 # log2(e) + + head_kv = heads // kv_group + q_shape = [batch, seq_len, heads, dim + tail_dim] + kv_shape = [batch, seq_len_kv, kv_group, dim + tail_dim] + o_shape = [batch, seq_len, heads, dim] + indices_shape = [batch, seq_len, kv_group, topk] + lse_shape = [batch, seq_len, heads] + indices_dtype = "int32" + dtype = "bfloat16" + accum_dtype = "float" + + G = kv_group + H = head_kv + padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) + if padded_H != H: + assert kv_group == 1, ( + "here we solve the H padding automatically, other wise you " + "should handle Q copy and Output copy with your mask (when " + "kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would " + "be handled automatically)" + ) + BI = block_I + NI = tilelang.cdiv(topk, 
block_I) + assert NI % 2 == 0, "NI should be a multiple of 2" + D = dim + D_tail = tail_dim + KV_stride = kv_stride + if head_kv > 64: + assert head_kv % 64 == 0, "head_kv should be a multiple of 64" + REPLICATE_H = head_kv // 64 + else: + REPLICATE_H = 1 + + # Increasing from 32->64 reduces the time spent reading kvcache. If num_query_head = 128 + # and num_kv_head = 1, the same kvcache originally needed to be read 4 times, but now only 2 times + H_per_block = padded_H if REPLICATE_H == 1 else 64 + + @T.prim_func + def main( + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + q_start_index_s: T.Tensor(1, indices_dtype), # type: ignore + Output: T.Tensor(o_shape, dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + ): + with T.Kernel( + # If CP0 is True (i.e., start of sequence), skip the first (KV_stride - 1) + # queries that cannot see any KV. Also be careful that seq_len < kv_stride could cause negative grid size + (max(0, seq_len - kv_stride + 1) if CP0 else seq_len) * REPLICATE_H, + batch, + kv_group, + threads=threads, + ) as (bx, by, bz): + Q_shared_l = T.alloc_shared([H_per_block, D // 2], dtype) + Q_shared_r = T.alloc_shared([H_per_block, D // 2], dtype) + Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) + + KV_shared_0_l = T.alloc_shared([BI, D // 2], dtype) + KV_shared_0_r = T.alloc_shared([BI, D // 2], dtype) + KV_shared_1_l = T.alloc_shared([BI, D // 2], dtype) + KV_shared_1_r = T.alloc_shared([BI, D // 2], dtype) + K_tail_shared_0 = T.alloc_shared([BI, D_tail], dtype) + K_tail_shared_1 = T.alloc_shared([BI, D_tail], dtype) + + O_shared_l = Q_shared_l + O_shared_r = Q_shared_r + + # Whether the kv in current BI is visible for this query + # Producer alternates writing to buf0 and buf1 masks. 
To avoid the situation + # where consumer0 is still reading buf0 mask when producer has already started + # writing buf1 mask, we use two buf masks + is_kv_valid = T.alloc_shared([2, BI], "bool", scope="shared") + + acc_o_l = T.alloc_fragment([H_per_block, D // 2], accum_dtype) + acc_o_r = T.alloc_fragment([H_per_block, D // 2], accum_dtype) + + # WG0 computes S0(BI_2*i), WG1 computes S1(BI_2*i+1), shared via shared memory + + # Reuse K_tail_shared for S_shared to save memory when dimensions match + # Must reuse, otherwise H100 SM's shared mem is insufficient (> 228kb), this is shared mem bound + S_shared_0 = K_tail_shared_0 + S_shared_1 = K_tail_shared_1 + + # WG0 and WG1 exchange local max with each other, compare to compute global max, and rescale their O_L or O_R accordingly + row_max_shared_0 = T.alloc_shared([H_per_block], accum_dtype) + row_max_shared_1 = T.alloc_shared([H_per_block], accum_dtype) + + # Used to store sum of exps for even BI and odd BI respectively, which will be summed up for integration later + row_sum_shared_0 = T.alloc_shared([H_per_block], accum_dtype) + row_sum_shared_1 = T.alloc_shared([H_per_block], accum_dtype) + + # acc_s, sumexp, m_i each need to be allocated separately for consumer0 and consumer1 + acc_s_0 = T.alloc_fragment([H_per_block, BI], accum_dtype) + acc_s_1 = T.alloc_fragment([H_per_block, BI], accum_dtype) + + sumexp_0 = T.alloc_fragment([H_per_block], accum_dtype) + sumexp_i_0 = T.alloc_fragment([H_per_block], accum_dtype) + m_i_0 = T.alloc_fragment([H_per_block], accum_dtype) + m_i_prev_0 = T.alloc_fragment([H_per_block], accum_dtype) + m_i_peer_0 = T.alloc_fragment([H_per_block], accum_dtype) + + sumexp_1 = T.alloc_fragment([H_per_block], accum_dtype) + sumexp_i_1 = T.alloc_fragment([H_per_block], accum_dtype) + m_i_1 = T.alloc_fragment([H_per_block], accum_dtype) + m_i_prev_1 = T.alloc_fragment([H_per_block], accum_dtype) + m_i_peer_1 = T.alloc_fragment([H_per_block], accum_dtype) + + bar_q = T.alloc_barrier(arrive_count=384) + + # Producer -> Consumer Barriers + bar_k_0_ready = T.alloc_barrier(arrive_count=128) # Prod arrives + bar_k_1_ready = T.alloc_barrier(arrive_count=128) # Prod arrives + + # Consumer -> Producer Barriers (Both consumers must arrive) + bar_k_0_free = T.alloc_barrier(arrive_count=256) + bar_k_1_free = T.alloc_barrier(arrive_count=256) + + # Inter-Consumer Barriers (Seesaw Sync) + bar_stats_0_ready = T.alloc_barrier(arrive_count=128) # Cons 0 arrives + bar_stats_1_ready = T.alloc_barrier(arrive_count=128) # Cons 1 arrives + + bar_S_0_ready = T.alloc_barrier(arrive_count=128) # Cons 0 arrives + bar_S_1_ready = T.alloc_barrier(arrive_count=128) # Cons 1 arrives + + b_i, g_i = by, bz + # If it's the first chunk, start computing directly from the (kv_stride - 1)-th token + s_i = (bx + (KV_stride - 1 if CP0 else 0)) if REPLICATE_H == 1 else (bx // REPLICATE_H + (KV_stride - 1 if CP0 else 0)) + q_i = q_start_index_s[0] + s_i + # Sometimes to reduce kvcache size, we may not store KV for every token, but store + # KV every KV_stride tokens (usually the last token in the stride window), + # so the kv range visible to the current query should be [0:max_kv_i] + max_kv_i = (q_i + 1 - KV_stride) // KV_stride + + H0 = g_i * padded_H + (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * 64) + H1 = H0 + H_per_block + + tx = T.get_thread_binding() + + T.copy(Q[b_i, s_i, H0:H1, 0 : D // 2], Q_shared_l) + T.copy(Q[b_i, s_i, H0:H1, D // 2 : D], Q_shared_r) + T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared) + + # Non-blockingly increment the 
barrier's internal counter, producer threads can start loading kv ahead of time + T.barrier_arrive(bar_q) + + if tx >= 256: + # producer: prefetch kvcache to shared mem + T.set_max_nreg(72, 0) + + prefetch_indices_0 = T.alloc_fragment([4], indices_dtype) + prefetch_indices_1 = T.alloc_fragment([4], indices_dtype) + + # Prime the Pump! Prefetch indices for iter_0 + for r in T.serial(4): + # This read will cause a long scoreboard stall, but it only happens once before the loop starts + prefetch_indices_0[r] = Indices[b_i, s_i, g_i, r * 16 + (tx - 256) // 8] + prefetch_indices_1[r] = Indices[b_i, s_i, g_i, BI + r * 16 + (tx - 256) // 8] + + for i_i in T.serial(T.ceildiv(NI, 2)): + # Buffer 0 + # Wait for both KV_shared_0_l and KV_shared_0_r to be done being used + + T.barrier_wait(bar_k_0_free[0], (i_i & 1)) + + # Block size `BI` is 64, loading is divided into 4 iterations, each processing 16 indices + # Producer has 128 threads total, 8 consecutive threads collaborate to load kv for one index + for r in T.serial(4): + # mitigate long scoreboard stall here + index = prefetch_indices_0[r] + is_kv_valid[0, r * 16 + (tx - 256) // 8] = index <= max_kv_i + if is_kv_valid[0, r * 16 + (tx - 256) // 8]: + # Here we assume dim = 512, tail_dim = 64 + with T.attr("default", "async_scope", 1): + # 8 threads collaborate to load one row of KV_dim (length 512), divided into 4 iterations + # In each iteration, each thread loads 8 consecutive elements for both KV_shared_0_l + # and KV_shared_0_r, 8 threads load 64 elements total for each + for u in T.serial(4): + for v in T.vectorized(8): + # (tx - 256) // 8 determines which row the thread is responsible for, + # (tx - 256) % 8 determines which part of the row the thread loads + KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + b_i, index, g_i, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + b_i, index, g_i, D // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] + with T.attr("default", "async_scope", 1): + # tail_dim (length 64) only needs 8 threads collaborating in one iteration to complete loading + for v in T.vectorized(8): + K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = KV[ + b_i, index, g_i, D + (tx - 256) % 8 * 8 + v + ] + T.cp_async_barrier_noinc(bar_k_0_ready[0]) + + if i_i + 1 < T.ceildiv(NI, 2): + # Async prefetch indices needed for the next round of kv data loading, overlaps with current round to hide latency + for r in T.serial(4): + prefetch_indices_0[r] = Indices[b_i, s_i, g_i, ((i_i + 1) * 2) * BI + r * 16 + (tx - 256) // 8] + + # Buffer 1 + T.barrier_wait(bar_k_1_free[0], (i_i & 1)) + + for r in T.serial(4): + index = prefetch_indices_1[r] + is_kv_valid[1, r * 16 + (tx - 256) // 8] = index <= max_kv_i + if is_kv_valid[1, r * 16 + (tx - 256) // 8]: + with T.attr("default", "async_scope", 1): + for u in T.serial(4): + for v in T.vectorized(8): + KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + b_i, index, g_i, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + b_i, index, g_i, D // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] + with T.attr("default", "async_scope", 1): + for v in T.vectorized(8): + K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = KV[ + b_i, index, g_i, D + (tx - 256) % 8 * 8 + v + ] + T.cp_async_barrier_noinc(bar_k_1_ready[0]) + + if i_i + 1 < T.ceildiv(NI, 2): + for r in T.serial(4): + 
prefetch_indices_1[r] = Indices[b_i, s_i, g_i, ((i_i + 1) * 2 + 1) * BI + r * 16 + (tx - 256) // 8] + + elif tx < 128: + # Check if 384 threads have already arrived at bar_q (phase0 completed), + # if not continue waiting, otherwise pass through directly + T.barrier_wait(bar_q, 0) + + # pre-arrive free barriers to indicate buffers are initially free + # At the beginning of phase0, tells producer it can load data into both buffers + T.barrier_arrive(bar_k_0_free[0]) + T.barrier_arrive(bar_k_1_free[0]) + + # Consumer 0 (WG0): Responsible for Even Blocks and O_L (Left Half) + T.set_max_nreg(216, 1) + T.fill(sumexp_0, 0) + for h_i in T.Parallel(H_per_block): + m_i_0[h_i] = -5e4 + T.fill(acc_o_l, 0) + + # Each iteration, two consumers cooperate to compute two BIs + for i_i in T.serial(T.ceildiv(NI, 2)): + # --- Step 1: Compute S0 = Q @ K0^T (Even Block) --- + T.barrier_wait(bar_k_0_ready[0], (i_i & 1)) + + T.fill(acc_s_0, 0) + T.gemm(Q_shared_l, KV_shared_0_l, acc_s_0, transpose_B=True, wg_wait=-1) + T.gemm(Q_shared_r, KV_shared_0_r, acc_s_0, transpose_B=True, wg_wait=-1) + T.gemm(Q_tail_shared, K_tail_shared_0, acc_s_0, transpose_B=True, wg_wait=-1) + + T.copy(m_i_0, m_i_prev_0) + T.wait_wgmma(0) + + for h_i, bi_i in T.Parallel(H_per_block, BI): + if not is_kv_valid[0, bi_i]: + acc_s_0[h_i, bi_i] = -5e4 + T.reduce_max(acc_s_0, m_i_0, dim=1, clear=False) + + # --- Step 2: Local Softmax Stats & Exchange --- + T.copy(m_i_0, row_max_shared_0) + T.barrier_arrive(bar_stats_0_ready) + # If consumer0 has received the local max from consumer1 at iter_i, this also means + # consumer1 has finished using S_0 passed by consumer0 at iter_i-1, + # so we can write to it directly without blocking below + T.barrier_wait(bar_stats_1_ready, (i_i & 1)) + T.copy(row_max_shared_1, m_i_peer_0) + + # Update global max and scale O + for h_i in T.Parallel(H_per_block): + m_i_0[h_i] = T.max(m_i_0[h_i], m_i_peer_0[h_i]) + + # Scale O_L + for h_i, d_i in T.Parallel(H_per_block, D // 2): + acc_o_l[h_i, d_i] *= T.exp2((m_i_prev_0[h_i] - m_i_0[h_i]) * sm_scale) + + # Scale SumExp + for h_i in T.Parallel(H_per_block): + sumexp_0[h_i] *= T.exp2((m_i_prev_0[h_i] - m_i_0[h_i]) * sm_scale) + + # Compute P0 = exp(S0 - m_new) + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s_0[h_i, bi_i] = T.exp2(acc_s_0[h_i, bi_i] * sm_scale - m_i_0[h_i] * sm_scale) + + # Update SumExp with P0 + T.reduce_sum(acc_s_0, sumexp_i_0, dim=1) + for h_i in T.Parallel(H_per_block): + sumexp_0[h_i] += sumexp_i_0[h_i] + + # --- Step 3: O_L += P0 @ V0_L (Self-Attention) --- + # Wait for S0 buffer to be free (consumed by peer in prev iter) + # T.barrier_wait(bar_S_0_free, (i_i & 1)) + T.copy(acc_s_0, S_shared_0) + T.barrier_arrive(bar_S_0_ready) + + T.gemm(S_shared_0, KV_shared_0_l, acc_o_l, transpose_B=False, wg_wait=-1) + + # --- Step 4: O_L += P1 @ V1_L (Cross-Attention) --- + # Wait for P1 (S1) from peer + T.barrier_wait(bar_S_1_ready, (i_i & 1)) + + T.gemm(S_shared_1, KV_shared_1_l, acc_o_l, transpose_B=False, wg_wait=-1) + + # NOTE: However, k_0 and k_1 are used by both consumer0 and consumer1, so this doesn't bring much performance improvement + # Except for the most recent async gemm (i.e., S_shared_1 @ KV_shared_1_k), all others need to wait to finish + T.wait_wgmma(1) + T.barrier_arrive(bar_k_0_free[0]) + # Wait for all async gemms to finish + T.wait_wgmma(0) + T.barrier_arrive(bar_k_1_free[0]) + + T.copy(sumexp_0, row_sum_shared_0) + T.barrier_arrive(bar_stats_0_ready) # Reuse barrier + T.barrier_wait(bar_stats_1_ready, T.ceildiv(NI, 2) & 1) + 
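The per-iteration rescaling above is the standard two-block online-softmax update, and the only cross-warpgroup traffic it needs is the row maxima (plus, after the loop, the row sums). A single-threaded PyTorch sketch of one consumer's step, with illustrative names rather than the kernel's buffers:

import torch

def consumer_softmax_step(acc_o, m_prev, sumexp, scores_own, m_peer, sm_scale):
    # One iteration of the update each consumer performs above, without barriers:
    # `scores_own` is this consumer's S block, `m_peer` the row max received from
    # the other warpgroup for its block (illustrative sketch only).
    m_own = torch.maximum(m_prev, scores_own.max(dim=-1).values)
    m_new = torch.maximum(m_own, m_peer)                 # global max over both blocks
    rescale = torch.exp2((m_prev - m_new) * sm_scale)
    acc_o = acc_o * rescale[:, None]                     # rescale the running output half
    sumexp = sumexp * rescale                            # rescale the running denominator
    p = torch.exp2(scores_own * sm_scale - (m_new * sm_scale)[:, None])
    sumexp = sumexp + p.sum(dim=-1)
    return acc_o, m_new, sumexp, p                       # p then feeds both P @ V halves

After the loop each consumer adds the peer's accumulated sum (exchanged through row_sum_shared_*) before dividing, so only the denominators, never the output halves, have to cross the warpgroup boundary.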
T.copy(row_sum_shared_1, sumexp_i_0) # Reuse sumexp_i buffer + + for h_i in T.Parallel(H_per_block): + sumexp_0[h_i] += sumexp_i_0[h_i] + + for h_i, d_i in T.Parallel(H_per_block, D // 2): + acc_o_l[h_i, d_i] /= sumexp_0[h_i] + + for h_i in T.Parallel(H_per_block): + sumexp_0[h_i] = T.log2(sumexp_0[h_i]) + m_i_0[h_i] * sm_scale + + T.copy(acc_o_l, O_shared_l) + T.copy(O_shared_l, Output[b_i, s_i, H0:H1, 0 : D // 2]) + T.copy(sumexp_0, Lse[b_i, s_i, H0:H1]) # Write LSE + + elif tx >= 128 and tx < 256: + T.barrier_wait(bar_q, 0) + + # pre-arrive free barriers to indicate buffers are initially free + # At the beginning of phase0, tells producer it can load data into both buffers + T.barrier_arrive(bar_k_0_free[0]) + T.barrier_arrive(bar_k_1_free[0]) + + # Consumer 1 (WG1): Responsible for Odd Blocks and O_R (Right Half) + # NOTE: 256 * 216 + 128 * 72 = 64,512 < 65536 (H100 SM RegFile Limit), + # setting more registers will cause a hang, all values must be multiples of 8 + T.set_max_nreg(216, 1) + T.fill(sumexp_1, 0) + for h_i in T.Parallel(H_per_block): + m_i_1[h_i] = -5e4 + T.fill(acc_o_r, 0) + + for i_i in T.serial(T.ceildiv(NI, 2)): + # --- Step 1: Compute S1 = Q @ K1^T (Odd Block) --- + T.barrier_wait(bar_k_1_ready[0], (i_i & 1)) + + T.fill(acc_s_1, 0) + T.gemm(Q_shared_l, KV_shared_1_l, acc_s_1, transpose_B=True, wg_wait=-1) + T.gemm(Q_shared_r, KV_shared_1_r, acc_s_1, transpose_B=True, wg_wait=-1) + T.gemm(Q_tail_shared, K_tail_shared_1, acc_s_1, transpose_B=True, wg_wait=-1) + + # --- Step 2: Local Softmax Stats & Exchange --- + T.copy(m_i_1, m_i_prev_1) + T.wait_wgmma(0) + + for h_i, bi_i in T.Parallel(H_per_block, BI): + if not is_kv_valid[1, bi_i]: + acc_s_1[h_i, bi_i] = -5e4 + + T.reduce_max(acc_s_1, m_i_1, dim=1, clear=False) + T.copy(m_i_1, row_max_shared_1) + T.barrier_arrive(bar_stats_1_ready) + T.barrier_wait(bar_stats_0_ready, (i_i & 1)) + T.copy(row_max_shared_0, m_i_peer_1) + + for h_i in T.Parallel(H_per_block): + m_i_1[h_i] = T.max(m_i_1[h_i], m_i_peer_1[h_i]) + + for h_i, d_i in T.Parallel(H_per_block, D // 2): + acc_o_r[h_i, d_i] *= T.exp2((m_i_prev_1[h_i] - m_i_1[h_i]) * sm_scale) + + for h_i in T.Parallel(H_per_block): + sumexp_1[h_i] *= T.exp2((m_i_prev_1[h_i] - m_i_1[h_i]) * sm_scale) + + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s_1[h_i, bi_i] = T.exp2(acc_s_1[h_i, bi_i] * sm_scale - m_i_1[h_i] * sm_scale) + + T.reduce_sum(acc_s_1, sumexp_i_1, dim=1) + for h_i in T.Parallel(H_per_block): + sumexp_1[h_i] += sumexp_i_1[h_i] + + # --- Step 3: O_R += P1 @ V1_R (Self-Attention) --- + T.copy(acc_s_1, S_shared_1) + + T.barrier_arrive(bar_S_1_ready) + + T.gemm(S_shared_1, KV_shared_1_r, acc_o_r, transpose_B=False, wg_wait=-1) + + # --- Step 4: O_R += P0 @ V0_R (Cross-Attention) --- + T.barrier_wait(bar_S_0_ready, (i_i & 1)) + + T.gemm(S_shared_0, KV_shared_0_r, acc_o_r, transpose_B=False, wg_wait=-1) + + T.wait_wgmma(1) + T.barrier_arrive(bar_k_1_free[0]) + T.wait_wgmma(0) + T.barrier_arrive(bar_k_0_free[0]) + + T.copy(sumexp_1, row_sum_shared_1) + T.barrier_arrive(bar_stats_1_ready) + T.barrier_wait(bar_stats_0_ready, T.ceildiv(NI, 2) & 1) + T.copy(row_sum_shared_0, sumexp_i_1) + + for h_i in T.Parallel(H_per_block): + sumexp_1[h_i] += sumexp_i_1[h_i] + + for h_i, d_i in T.Parallel(H_per_block, D // 2): + acc_o_r[h_i, d_i] /= sumexp_1[h_i] + + T.copy(acc_o_r, O_shared_r) + T.copy(O_shared_r, Output[b_i, s_i, H0:H1, D // 2 : D]) + + return main + + +def sparse_mla_fwd_interface( + q, kv, indices, q_start_index_s, kv_stride, sm_scale=None, is_casual=True, 
return_kernel=False, print_kernel=False +): + assert q.is_contiguous() and kv.is_contiguous() and indices.is_contiguous() + batch, seq_len, heads, dim_plus_tail_dim = q.shape + _, seq_len_kv, kv_group, _ = kv.shape + + assert dim_plus_tail_dim == 576, "you should assign dim otherwise" + dim = 512 + + assert kv.shape[-1] == dim_plus_tail_dim + tail_dim = dim_plus_tail_dim - dim + assert kv.shape[0] == batch + _, _, _, topk = indices.shape + assert indices.shape == (batch, seq_len, kv_group, topk) + + if q_start_index_s != 0: + assert q_start_index_s > kv_stride, ( + "If it is because each cp has too short length, you should fix the logic involving CP0 (cp_rank == 0), to make sure q with pos < KV_Stride - 1 is masked (or you may just ignore how this is handled if nan in these q's Out would not effect others, which is reported to be likely to happen by wangding)" + ) + CP0 = q_start_index_s == 0 + + # Compile the kernel + kernel = sparse_mla_fwd(batch, seq_len, seq_len_kv, heads, dim, tail_dim, topk, kv_stride, kv_group, sm_scale, is_casual, CP0) + + if print_kernel: + print(kernel.get_kernel_source()) + + if return_kernel: + return kernel + + ( + out, + lse, + ) = kernel(q, kv, indices, torch.tensor([q_start_index_s], dtype=torch.int32, device="cuda")) + if q_start_index_s == 0 and kv_stride > 1: + # Set the output of the first (kv_stride - 1) positions to 0, since they cannot see any kv so no computation was performed + out[:, : kv_stride - 1, :, :] = 0 + return out, lse + + +def ref_sparse_mla_fwd_interface(q, kv, indices, q_start_index_s, kv_stride=1, sm_scale=None, is_casual=True): + q = q.float() + kv = kv.float() + indices = indices.transpose(1, 2) + b, sq, h, dim_q = q.shape + b, sk, g, _ = kv.shape + if q_start_index_s is None: + q_start_index_s = sk * kv_stride - sq + + assert kv.shape[-1] == 576, "you should assign dim otherwise" + dim = 512 + k = kv + v = kv[..., :dim] + + b, _, _, dim_v = v.shape + num_kv_per_index = 1 + g_index = g + h_index = h // g + compressed_casual_mask = torch.arange(q_start_index_s, sq + q_start_index_s, dtype=torch.int32, device="cuda").view( + -1, 1 + ) >= torch.arange(kv_stride - 1, sk * kv_stride, kv_stride, dtype=torch.int32, device="cuda").view(1, -1) + + mask = q.new_zeros(b, g_index, sq, sk + 1, dtype=torch.bool).scatter(3, indices.long(), 1) + mask = mask[..., :-1] + mask = mask & compressed_casual_mask.view(1, 1, sq, sk) + mask[:, :, : kv_stride - 1, 0] = True + mask = mask.view(b, g_index, 1, sq, sk) + + q = q.view(b, sq, g, -1, dim_q) + score = torch.einsum("bmghd,bngd->bghmn", q, k) + sm_scale = dim_q**-0.5 if sm_scale is None else sm_scale + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) + p = score.softmax(dim=-1) + p = p.view(b, g_index, h_index, -1, sq, sk) + p = p.view(b, g, -1, sq, sk) + o = torch.einsum("bghmn,bngd->bmghd", p.type(v.dtype), v) + o = o.reshape(b, sq, h, dim_v) + return o.to(torch.bfloat16) + + +def test_sparse_mla_fwd_pipelined( + B=1, + S=4096, + SKV=8192, + H=128, + HKV=1, + DQK=576, + DV=512, + topk=2048, + dtype=torch.bfloat16, + # Offset of query in global sequence position (or relative to kv) + q_start_s_index=2048, + check_correctness=True, + profile=False, +): + KV_stride = 1 + + torch.random.manual_seed(0) + q = torch.randn((B, S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) / 10 + kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True) / 10 + q_start_s_index_t = torch.tensor([q_start_s_index], dtype=torch.int32, device="cuda") + + q.clamp_(-10, 10) + 
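With KV_stride > 1 only one compressed KV row is stored per stride window, so the kernel's validity test (index <= max_kv_i) and the reference's compressed_casual_mask reduce to the same per-query bound. A small restatement of that bound (hypothetical helper, for illustration only):

def num_visible_kv_rows(q_pos_global: int, kv_stride: int) -> int:
    # A query at global position q_pos_global may attend to compressed rows
    # 0 .. max_kv_i inclusive, with max_kv_i = (q_pos_global + 1 - kv_stride) // kv_stride,
    # i.e. only stride windows whose last token is at or before the query.
    return max(0, (q_pos_global + 1 - kv_stride) // kv_stride + 1)

assert num_visible_kv_rows(q_pos_global=3, kv_stride=4) == 1   # sees the row covering tokens 0..3
assert num_visible_kv_rows(q_pos_global=2, kv_stride=4) == 0   # the first kv_stride - 1 queries see nothing
assert num_visible_kv_rows(q_pos_global=7, kv_stride=1) == 8   # dense case: plain causal prefix

The zero-visibility case is also why the wrapper above zeroes the first kv_stride - 1 output rows when q_start_index_s == 0.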
kv.clamp_(-10, 10) + + indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device="cuda") + for b in range(B): + for t in range(S): + for h in range(HKV): + # Add offset q_start_s_index to convert to global sequence position + i_i = torch.randperm(min(max(1, ((t + q_start_s_index) // KV_stride)), SKV))[:topk] + indices[b, t, h, : len(i_i)] = i_i + + print("index generation finished") + + kernel = sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride, return_kernel=True, print_kernel=True) + + def fn(): + return kernel(q, kv, indices, q_start_s_index_t) + + if check_correctness: + tl_out, tl_lse = fn() + assert KV_stride == 1, "KV_stride > 1 not supported" + # if q_start_s_index == 0 and KV_stride > 1: + # tl_out[:, :KV_stride - 1, :, :] = 0 + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride) + print(f"tl_out: {tl_out}") + print(f"ref_out: {ref_out}") + torch.testing.assert_close(tl_out, ref_out, rtol=1e-3, atol=1e-3) + + if profile: + print("Profiling mode: running minimal iterations (1 warmup + 1 run)...") + fn() + torch.cuda.synchronize() + fn() + torch.cuda.synchronize() + return + + from tilelang.profiler import do_bench + + ms = do_bench( + fn, + rep=20, + warmup=10, + ) + print(f"Average time: {ms:.3f} ms") + print(f"fwd io bandwidth = ", (B * S * DQK * topk * 2) / (ms * 1e-3) / 1e12) + tflops = (B * S * (DQK + DV) * topk * 2 * H) / (ms * 1e-3) / 1e12 + print(f"fwd tflops = {tflops:.2f}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--test_correctness", action="store_true") + parser.add_argument("--profile", action="store_true") + args = parser.parse_args() + if args.test_correctness: + B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 1024, 8192, 128, 1, 576, 512, 2048, torch.bfloat16 + test_sparse_mla_fwd_pipelined(B, S, SKV, H, HKV, DQK, DV, topk, dtype, check_correctness=True, profile=args.profile) + else: + # Prefill Benchmark: long context + print(" --- Prefill Benchmark --- ") + B, S, SKV, H, HKV, DQK, DV, topk, dtype = 2, 4096, 8192, 128, 1, 576, 512, 2048, torch.bfloat16 + test_sparse_mla_fwd_pipelined( + B, S, SKV, H, HKV, DQK, DV, topk, dtype, q_start_s_index=4096, check_correctness=False, profile=args.profile + ) + + # Decode Benchmark: large batch size, high throughput generation + print("\n --- Decode Benchmark --- ") + # Increase batch size to saturate h100 for decode + B, S, SKV, H, HKV, DQK, DV, topk, dtype = 128 * 16, 2, 8192, 128, 1, 576, 512, 2048, torch.bfloat16 + test_sparse_mla_fwd_pipelined( + B, S, SKV, H, HKV, DQK, DV, topk, dtype, q_start_s_index=2048 + 4096, check_correctness=False, profile=args.profile + ) diff --git a/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py b/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py index e10141b59..983798f9f 100644 --- a/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py +++ b/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py @@ -1,4 +1,5 @@ # ruff: noqa +import tilelang import tilelang.testing import topk_selector @@ -20,23 +21,23 @@ def test_example_fp8_lighting_indexer(): @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_sparse_mla_fwd(): # small shapes for testing - sparse_mla_fwd.test_sparse_mla_fwd( - S=256, SKV=1024, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False) + sparse_mla_fwd.test_sparse_mla_fwd(S=256, SKV=1024, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False) @tilelang.testing.requires_cuda 
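The bandwidth and TFLOPs figures printed by the test driver above come from straightforward counting: every (batch, query, head) pair performs one DQK-wide dot product and one DV-wide accumulation per selected key, at 2 FLOPs per element, while roughly DQK elements of KV (2 bytes each for bf16) are streamed per (query, selected key). A helper restating those formulas (illustrative, mirrors the prints only):

def sparse_mla_fwd_cost(B, S, H, DQK, DV, topk, ms, bytes_per_elem=2):
    # FLOPs: 2 * (DQK + DV) per (query, head, selected key); bytes: DQK * bytes_per_elem
    # of KV per (query, selected key), ignoring Q/O/indices traffic.
    tflops = (B * S * H * (DQK + DV) * topk * 2) / (ms * 1e-3) / 1e12
    tb_per_s = (B * S * DQK * topk * bytes_per_elem) / (ms * 1e-3) / 1e12
    return tflops, tb_per_s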
@tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_sparse_mla_fwd_pipelined(): # small shapes for testing - sparse_mla_fwd_pipelined.test_sparse_mla_fwd_pipelined( - S=256, SKV=512, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False) + sparse_mla_fwd_pipelined.test_sparse_mla_fwd_pipelined(S=256, SKV=512, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False) @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_sparse_mla_bwd(): + sparse_mla_bwd.test_sparse_mla_bwd(S=256, SKV=512, H=64, HKV=1, DQKV=576, DV=512, topk=256, check_correctness=False) sparse_mla_bwd.test_sparse_mla_bwd( - S=256, SKV=512, H=64, HKV=1, DQKV=576, DV=512, topk=256, check_correctness=False) + S=256, SKV=512, H=128, HKV=1, DQKV=576, DV=512, topk=256, check_correctness=False + ) # test for large H if __name__ == "__main__": diff --git a/examples/deepseek_v32/topk_selector.py b/examples/deepseek_v32/topk_selector.py index 4a4b43277..8b29c6fd5 100644 --- a/examples/deepseek_v32/topk_selector.py +++ b/examples/deepseek_v32/topk_selector.py @@ -8,24 +8,24 @@ def convert_to_uint16(x): - hval = T.Cast("float16", x) - bits_uint = T.reinterpret("uint16", hval) + hval = T.cast(x, T.float16) + bits_uint = T.reinterpret(hval, T.uint16) bits_uint = T.if_then_else(x < 0, ~bits_uint & (0xFFFF), bits_uint | (0x8000)) return bits_uint >> 8 def convert_to_uint32(x): - bits_uint = T.reinterpret("uint32", x) + bits_uint = T.reinterpret(x, T.uint32) bits_uint = T.if_then_else( x < 0, - ~bits_uint & T.Cast("uint32", (0xFFFFFFFF)), - bits_uint | T.Cast("uint32", (0x80000000)), + ~bits_uint & T.cast((0xFFFFFFFF), T.uint32), + bits_uint | T.cast((0x80000000), T.uint32), ) return bits_uint @tilelang.jit(pass_configs=pass_configs) -def tl_topk_impl(topk, in_dtype="float32", out_dtype="int32"): +def tl_topk_impl(topk, in_dtype=T.float32, out_dtype=T.int32): batch = T.dynamic("batch") seq_len = T.dynamic("seq_len") RADIX = 1 << 8 @@ -42,20 +42,22 @@ def tl_topk_kernel( with T.Kernel(batch, threads=BLOCK_SIZE) as (bx): tx = T.get_thread_binding() - s_threshold_bin_id = T.alloc_shared([1], "int32") - s_histogram = T.alloc_shared([RADIX + 1], "int32") - s_num_input = T.alloc_shared([2], "int32") - s_input_idx = T.alloc_shared([2, SMEM_INPUT_SIZE], "int32") - - l_threshold_bin_id = T.alloc_var("int32") - l_new_topk = T.alloc_var("int32") - l_num_input = T.alloc_var("int32") - l_bin_id32 = T.alloc_var("int32") - l_val = T.alloc_var("int32") - l_start_pos = T.alloc_var("int32") - l_start_idx = T.alloc_var("int32") - l_end_idx = T.alloc_var("int32") - l_out_pos = T.alloc_var("int32") + s_threshold_bin_id = T.alloc_shared([1], T.int32) + s_histogram = T.alloc_shared([RADIX + 1], T.int32) + s_num_input = T.alloc_shared([2], T.int32) + s_input_idx = T.alloc_shared([2, SMEM_INPUT_SIZE], T.int32) + + l_threshold_bin_id = T.alloc_var(T.int32) + l_new_topk = T.alloc_var(T.int32) + l_num_input = T.alloc_var(T.int32) + l_bin_id32 = T.alloc_var(T.int32) + l_val = T.alloc_var(T.int32) + l_start_pos = T.alloc_var(T.int32) + l_start_idx = T.alloc_var(T.int32) + l_end_idx = T.alloc_var(T.int32) + l_out_pos = T.alloc_var(T.int32) + + pos = T.alloc_var(T.int32) l_new_topk = topk l_start_idx = starts[bx] @@ -99,7 +101,7 @@ def tl_topk_kernel( input_idx = s * BLOCK_SIZE + tx if input_idx < l_end_idx and input_idx >= l_start_idx and input_idx < seq_len: bin_id = convert_to_uint16(input[bx, input_idx]) - l_bin_id32 = T.Cast("int32", bin_id) + l_bin_id32 = T.cast(bin_id, 
T.int32) if l_bin_id32 > l_threshold_bin_id: # need a pos = T.atomic_add(s_histogram[bin_id32+1], 1) pos = T.atomic_add(s_histogram[l_bin_id32 + 1], 1, return_prev=True) @@ -113,7 +115,7 @@ def tl_topk_kernel( # stage 2: tail pass for round in T.serial(4): if l_new_topk <= 0: - T.loop_break() + break r_idx = round % 2 l_start_pos = topk - l_new_topk @@ -127,9 +129,9 @@ def tl_topk_kernel( l_num_input = s_num_input[r_idx] for s in T.serial(T.ceildiv(l_num_input, BLOCK_SIZE)): if s * BLOCK_SIZE + tx < l_num_input: - l_bin_id32 = T.Cast("int32", (( - convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> - (24 - round * 8)) & 0xFF)) + l_bin_id32 = T.cast( + ((convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> (24 - round * 8)) & 0xFF), T.int32 + ) T.atomic_add(s_histogram[l_bin_id32], 1) T.sync_threads() # cumsum @@ -156,23 +158,20 @@ def tl_topk_kernel( for s in T.serial(T.ceildiv(l_num_input, BLOCK_SIZE)): T.sync_threads() if s * BLOCK_SIZE + tx < l_num_input: - l_bin_id32 = T.Cast("int32", (( - convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> - (24 - round * 8)) & 0xFF)) + l_bin_id32 = T.cast( + ((convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> (24 - round * 8)) & 0xFF), T.int32 + ) if l_bin_id32 > l_threshold_bin_id: - pos = T.atomic_add( - s_histogram[l_bin_id32 + 1], 1, return_prev=True) + l_start_pos + pos = T.atomic_add(s_histogram[l_bin_id32 + 1], 1, return_prev=True) + l_start_pos index[bx, pos] = s_input_idx[r_idx, s * BLOCK_SIZE + tx] elif l_bin_id32 == l_threshold_bin_id and l_new_topk > 0: if round == 3: - l_out_pos = T.atomic_add( - s_histogram[l_bin_id32 + 1], 1, return_prev=True) + l_start_pos + l_out_pos = T.atomic_add(s_histogram[l_bin_id32 + 1], 1, return_prev=True) + l_start_pos if l_out_pos < topk: index[bx, l_out_pos] = s_input_idx[r_idx, s * BLOCK_SIZE + tx] else: pos = T.atomic_add(s_num_input[r_idx ^ 1], 1, return_prev=True) - s_input_idx[r_idx ^ 1, pos] = s_input_idx[r_idx, - s * BLOCK_SIZE + tx] + s_input_idx[r_idx ^ 1, pos] = s_input_idx[r_idx, s * BLOCK_SIZE + tx] return tl_topk_kernel @@ -186,10 +185,6 @@ def tl_topk(input, starts, ends, topk): def test_topk_selector(batch=64, seq_len=32 * 1024, topk=2048): - - batch = 64 - seq_len = 32 * 1024 - topk = 2048 torch.manual_seed(1) input = torch.randn(batch, seq_len, dtype=torch.float32).cuda() starts = torch.zeros(batch, dtype=torch.int32).cuda() @@ -212,8 +207,7 @@ def test_topk_selector(batch=64, seq_len=32 * 1024, topk=2048): set_ref = set(ref_np) set_trt = set(trt_np) intersection = set_ref & set_trt - print("selected/all:", len(intersection), "/", len(set_ref), "=", - len(intersection) / len(set_ref)) + print("selected/all:", len(intersection), "/", len(set_ref), "=", len(intersection) / len(set_ref)) # Performance test with CUDA events @@ -245,5 +239,19 @@ def test_topk_selector(batch=64, seq_len=32 * 1024, topk=2048): print(f"Average torch.topk time: {elapsed_time_ms / n_iters:.3f} ms") +def run_regression_perf(batch=64, seq_len=32 * 1024, topk=2048): + torch.manual_seed(1) + input = torch.randn(batch, seq_len, dtype=torch.float32).cuda() + starts = torch.zeros(batch, dtype=torch.int32).cuda() + ends = torch.ones(batch, dtype=torch.int32).cuda() * seq_len + + from tilelang.profiler import do_bench + + def run_kernel_only(): + tl_topk(input, starts, ends, topk) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": test_topk_selector() diff --git a/examples/deepseek_v32/utils.py 
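The radix selection above relies on convert_to_uint16 / convert_to_uint32 producing keys whose unsigned order matches the float order, so the largest scores always land in the highest buckets; each pass then buckets on an 8-bit digit of the key, starting from the most significant. A PyTorch restatement of the 32-bit transform, for illustration:

import torch

def float32_to_radix_key_u32(x: torch.Tensor) -> torch.Tensor:
    # Order-preserving float32 -> 32-bit key: flip all bits of negatives, set the
    # sign bit of non-negatives; presented as unsigned values in an int64 tensor.
    bits = x.view(torch.int32)
    flipped = torch.where(x < 0, ~bits, bits | torch.tensor(-0x80000000, dtype=torch.int32))
    return flipped.to(torch.int64) & 0xFFFFFFFF

x = torch.tensor([-2.0, -0.5, 0.0, 0.5, 2.0], dtype=torch.float32)
k = float32_to_radix_key_u32(x)
assert torch.all(k[:-1] < k[1:])            # monotone, so radix select on k == top-k on x
assert int(k[-1] >> 24) > int(k[0] >> 24)   # first-pass bucket id = top 8 bits, as in the kernel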
b/examples/deepseek_v32/utils.py index 2ea34b14a..d7252e171 100644 --- a/examples/deepseek_v32/utils.py +++ b/examples/deepseek_v32/utils.py @@ -23,8 +23,7 @@ def _is_equal(a, b): if isinstance(a, torch.Tensor): return a is b # Whitelist of types that are safe to compare by value for caching. - if isinstance(a, (int, float, str, bool, type(None))) and isinstance( - b, (int, float, str, bool, type(None))): + if isinstance(a, (int, float, str, bool, type(None))) and isinstance(b, (int, float, str, bool, type(None))): return a == b # For other types, we cannot guarantee a cheap and safe comparison, so we fail the cache check. return False @@ -58,9 +57,11 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: if len(args) == len(last_args) and len(kwargs) == len(last_kwargs): # For Tensors, check for object identity. For other types, check for equality. # Python caches small integers, so `is` works for them but not for large integers like 4096. - if all(_is_equal(a, b) for a, b in zip(args, last_args)) and \ - set(kwargs.keys()) == set(last_kwargs.keys()) and \ - all(_is_equal(v, last_kwargs[k]) for k, v in kwargs.items()): + if ( + all(_is_equal(a, b) for a, b in zip(args, last_args)) + and set(kwargs.keys()) == set(last_kwargs.keys()) + and all(_is_equal(v, last_kwargs[k]) for k, v in kwargs.items()) + ): return last_result result = fn(*args, **kwargs) @@ -79,73 +80,68 @@ def cal_seq_idx_from_cu_seqlens(cu_seqlens: torch.LongTensor, seq_len: int): @tensor_cache -def cal_seq_idx_for_q(cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, - seq_len: int) -> torch.IntTensor: - seq_idx_for_q = torch.full((seq_len,), - len(cu_seqlens_qs), - dtype=torch.int32, - device=cu_seqlens_qs.device) +def cal_seq_idx_for_q(cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, seq_len: int) -> torch.IntTensor: + seq_idx_for_q = torch.full((seq_len,), len(cu_seqlens_qs), dtype=torch.int32, device=cu_seqlens_qs.device) for i in range(len(cu_seqlens_qs)): - seq_idx_for_q[cu_seqlens_qs[i]:cu_seqlens_qe[i]] = i + seq_idx_for_q[cu_seqlens_qs[i] : cu_seqlens_qe[i]] = i return seq_idx_for_q @tensor_cache -def cal_cu_seqlen_ks_for_q(cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, - cu_seqlens_ks: torch.LongTensor, seq_len: int) -> torch.IntTensor: +def cal_cu_seqlen_ks_for_q( + cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, cu_seqlens_ks: torch.LongTensor, seq_len: int +) -> torch.IntTensor: cu_seqlen_ks_for_each_q = torch.gather( - input=torch.cat([ - cu_seqlens_ks, - torch.full((1,), - torch.iinfo(torch.int32).max, - dtype=torch.int32, - device=cu_seqlens_qs.device) - ]), + input=torch.cat([cu_seqlens_ks, torch.full((1,), torch.iinfo(torch.int32).max, dtype=torch.int32, device=cu_seqlens_qs.device)]), dim=0, - index=cal_seq_idx_for_q( - cu_seqlens_qs=cu_seqlens_qs, cu_seqlens_qe=cu_seqlens_qe, seq_len=seq_len).long()) + index=cal_seq_idx_for_q(cu_seqlens_qs=cu_seqlens_qs, cu_seqlens_qe=cu_seqlens_qe, seq_len=seq_len).long(), + ) return cu_seqlen_ks_for_each_q.int() @tensor_cache -def cal_cu_seqlen_ke_for_q(cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, - cu_seqlens_ks: torch.LongTensor, cu_seqlens_ke: torch.LongTensor, - q_start_idxs: torch.LongTensor, seq_len: int, - kv_stride: int) -> torch.IntTensor: +def cal_cu_seqlen_ke_for_q( + cu_seqlens_qs: torch.LongTensor, + cu_seqlens_qe: torch.LongTensor, + cu_seqlens_ks: torch.LongTensor, + cu_seqlens_ke: torch.LongTensor, + q_start_idxs: torch.LongTensor, + seq_len: int, + kv_stride: int, +) -> 
torch.IntTensor: cu_seqlen_ke_for_each_q = torch.gather( - input=torch.cat( - [cu_seqlens_ke, - torch.zeros(1, dtype=torch.int32, device=cu_seqlens_qs.device)]), + input=torch.cat([cu_seqlens_ke, torch.zeros(1, dtype=torch.int32, device=cu_seqlens_qs.device)]), dim=0, - index=cal_seq_idx_for_q( - cu_seqlens_qs=cu_seqlens_qs, cu_seqlens_qe=cu_seqlens_qe, seq_len=seq_len).long()) - casual_cu_seqlen_ke_for_each_q = torch.zeros((seq_len,), - dtype=torch.int32, - device=cu_seqlens_qs.device) + index=cal_seq_idx_for_q(cu_seqlens_qs=cu_seqlens_qs, cu_seqlens_qe=cu_seqlens_qe, seq_len=seq_len).long(), + ) + casual_cu_seqlen_ke_for_each_q = torch.zeros((seq_len,), dtype=torch.int32, device=cu_seqlens_qs.device) for i in range(len(cu_seqlens_qs)): - casual_cu_seqlen_ke_for_each_q[cu_seqlens_qs[i]:cu_seqlens_qe[i]] = (torch.arange( - q_start_idxs[i], - q_start_idxs[i] + cu_seqlens_qe[i] - cu_seqlens_qs[i], - dtype=torch.int32, - device=cu_seqlens_qs.device) + 1) // kv_stride + cu_seqlens_ks[i] + casual_cu_seqlen_ke_for_each_q[cu_seqlens_qs[i] : cu_seqlens_qe[i]] = ( + torch.arange( + q_start_idxs[i], q_start_idxs[i] + cu_seqlens_qe[i] - cu_seqlens_qs[i], dtype=torch.int32, device=cu_seqlens_qs.device + ) + + 1 + ) // kv_stride + cu_seqlens_ks[i] cu_seqlen_ke_for_each_q = torch.minimum(casual_cu_seqlen_ke_for_each_q, cu_seqlen_ke_for_each_q) return cu_seqlen_ke_for_each_q.int() @tensor_cache -def cal_ks_ke_from_cu_seqlen_qk(cu_seqlens_q: torch.LongTensor, - cu_seqlens_k: torch.LongTensor = None, - offs_q: torch.LongTensor = None, - *, - seq_len: int, - kv_stride: int = 1, - cp_rank: int = 0, - cp_size: int = 1, - balanced_cp=False): - ''' +def cal_ks_ke_from_cu_seqlen_qk( + cu_seqlens_q: torch.LongTensor, + cu_seqlens_k: torch.LongTensor = None, + offs_q: torch.LongTensor = None, + *, + seq_len: int, + kv_stride: int = 1, + cp_rank: int = 0, + cp_size: int = 1, + balanced_cp=False, +): + """ seq_len: seq len per cp rank balanced cp slice assignment: 0 1 2 3 3 2 1 0 - ''' + """ n_seq = len(cu_seqlens_q) - 1 assert n_seq > 0 assert cu_seqlens_q.shape == (n_seq + 1,) @@ -170,10 +166,12 @@ def cal_ks_ke_from_cu_seqlen_qk(cu_seqlens_q: torch.LongTensor, def f(x: torch.Tensor): chunks = x.chunk(cp_size * 2) - return torch.cat([ - chunks[cp_rank], - chunks[cp_size - cp_rank - 1], - ]) + return torch.cat( + [ + chunks[cp_rank], + chunks[cp_size - cp_rank - 1], + ] + ) ks = f(ks) ke = f(ke) @@ -189,8 +187,7 @@ def ceil_to_ue8m0(x: torch.Tensor): return torch.pow(2.0, torch.ceil(torch.log2(x.abs()))) -def per_custom_dims_cast_to_fp8(x: torch.Tensor, dims: Tuple[int], - use_ue8m0: bool) -> Tuple[torch.Tensor, torch.Tensor]: +def per_custom_dims_cast_to_fp8(x: torch.Tensor, dims: Tuple[int], use_ue8m0: bool) -> Tuple[torch.Tensor, torch.Tensor]: excluded_dims = tuple([i for i in range(x.dim()) if i not in set(dims)]) x_amax = x.abs().float().amax(dim=excluded_dims, keepdim=True).clamp(1e-4) sf = x_amax / 448.0 @@ -239,14 +236,18 @@ def generate_random_cu_seqlens(per_cp_seqlen, cp_size=4, cp_rank=3, kv_stride=1, total_seqlen - (cp_rank + 1) * per_chunk_seqlen, total_seqlen - cp_rank * per_chunk_seqlen, ) - ks = torch.cat([ - cu_seqlens_ks_for_each_q[slice_short], - cu_seqlens_ks_for_each_q[slice_long], - ]) - ke = torch.cat([ - cu_seqlens_ke_for_each_q[slice_short], - cu_seqlens_ke_for_each_q[slice_long], - ]) + ks = torch.cat( + [ + cu_seqlens_ks_for_each_q[slice_short], + cu_seqlens_ks_for_each_q[slice_long], + ] + ) + ke = torch.cat( + [ + cu_seqlens_ke_for_each_q[slice_short], + 
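The gather plus arange bookkeeping in cal_cu_seqlen_ke_for_q above reduces to a per-query scalar bound; a restatement with illustrative names:

def causal_ke_for_query(q_pos_in_seq: int, seq_ks: int, seq_ke: int, kv_stride: int) -> int:
    # Exclusive upper bound on the compressed-KV rows a query may read, as computed
    # above: a query at position q_pos_in_seq within its full sequence (its sequence's
    # q_start_idx plus the local offset) sees rows
    # [seq_ks, seq_ks + (q_pos_in_seq + 1) // kv_stride), clipped by the sequence's own end seq_ke.
    return min(seq_ks + (q_pos_in_seq + 1) // kv_stride, seq_ke)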
cu_seqlens_ke_for_each_q[slice_long], + ] + ) assert len(ks) == len(ke) == per_cp_seqlen return ks, ke @@ -302,11 +303,9 @@ def assert_tensors_similar(x, y, eps=1e-8, name="tensor", raise_assert=True): raise_assert: Whether to raise assertion error on failure """ sim = calculate_tensor_similarity(x, y, name) - diff = 1. - sim + diff = 1.0 - sim if not (0 <= diff <= eps): - print( - f"\033[31mERROR: {name} similarity check failed, diff={diff:.2e} (threshold={eps:.2e})\033[0m" - ) + print(f"\033[31mERROR: {name} similarity check failed, diff={diff:.2e} (threshold={eps:.2e})\033[0m") if raise_assert: assert False # noqa: B011 @@ -316,11 +315,8 @@ def assert_tensors_similar(x, y, eps=1e-8, name="tensor", raise_assert=True): cu_seqlens = torch.randint(128, 4096, (1000,), dtype=torch.int32, device="cuda") last_idx = torch.where(cu_seqlens.cumsum(dim=0) >= seq_len)[0][0] cu_seqlens_cumsum = cu_seqlens[:last_idx].cumsum(dim=0) - cu_seqlens_qs = torch.cat( - [torch.zeros(1, dtype=torch.int32, device=cu_seqlens.device), cu_seqlens_cumsum]) - cu_seqlens_qe = torch.cat( - [cu_seqlens_cumsum, - torch.ones(1, dtype=torch.int32, device=cu_seqlens.device) * seq_len]) + cu_seqlens_qs = torch.cat([torch.zeros(1, dtype=torch.int32, device=cu_seqlens.device), cu_seqlens_cumsum]) + cu_seqlens_qe = torch.cat([cu_seqlens_cumsum, torch.ones(1, dtype=torch.int32, device=cu_seqlens.device) * seq_len]) from tilelang.profiler import do_bench diff --git a/examples/dequantize_gemm/README.md b/examples/dequantize_gemm/README.md index 0c6116775..25ef617a2 100644 --- a/examples/dequantize_gemm/README.md +++ b/examples/dequantize_gemm/README.md @@ -19,7 +19,7 @@ def dequant_matmul( T.clear(Ct_local) for k in T.Pipelined( - T.ceildiv(K, block_K), + T.ceildiv(K, block_K), num_stages=num_stages ): T.copy(A[by * block_M, k * block_K], A_shared) diff --git a/examples/dequantize_gemm/dequantize_utils.py b/examples/dequantize_gemm/dequantize_utils.py index b14c0aee6..90a6265ff 100644 --- a/examples/dequantize_gemm/dequantize_utils.py +++ b/examples/dequantize_gemm/dequantize_utils.py @@ -39,12 +39,10 @@ def torch_convert_bit_twiddling(tensor): res0 = val_concat_expanded & mask res1 = (val_concat_expanded << 3) & mask res2 = (val_concat_expanded << 6) & mask - res3 = ((val_concat_expanded << 1) & mask1) | ((val_concat_expanded >> 3) & mask2) | ( - (val_concat_expanded >> 7) & mask3) + res3 = ((val_concat_expanded << 1) & mask1) | ((val_concat_expanded >> 3) & mask2) | ((val_concat_expanded >> 7) & mask3) # Select the correct result based on position - bf16 = torch.where(pos == 0, res0, torch.where(pos == 1, res1, - torch.where(pos == 2, res2, res3))) + bf16 = torch.where(pos == 0, res0, torch.where(pos == 1, res1, torch.where(pos == 2, res2, res3))) # Convert to uint16 for .view(torch.bfloat16) bf16_uint16 = (bf16 & 0xFFFF).to(torch.uint16) @@ -110,7 +108,7 @@ def print_bit(name, val): val (torch.Tensor): A scalar PyTorch tensor (numeric) whose 32-bit binary representation will be shown. 
""" val_cpu = val.cpu().item() - binary_repr = f'{val_cpu:032b}' + binary_repr = f"{val_cpu:032b}" print(name, binary_repr) @@ -122,7 +120,7 @@ def calc_sim(x, y, name="tensor"): x, y = x.data.double(), y.data.double() denominator = (x * x + y * y).sum() if denominator == 0: - print_red_warning(f'{name} all zero') + print_red_warning(f"{name} all zero") return 1 sim = 2 * (x * y).sum() / denominator return sim @@ -132,21 +130,19 @@ def assert_similar(x, y, eps=1e-8, name="tensor", data="", raise_assert=True): x_mask = torch.isfinite(x) y_mask = torch.isfinite(y) if not torch.all(x_mask == y_mask): - print_red_warning(f'{name} Error: isfinite mask mismatch') + print_red_warning(f"{name} Error: isfinite mask mismatch") if raise_assert: raise AssertionError - if not torch.isclose( - x.masked_fill(x_mask, 0), y.masked_fill(y_mask, 0), rtol=0, atol=0, - equal_nan=True).all(): - print_red_warning(f'{name} Error: nonfinite value mismatch') + if not torch.isclose(x.masked_fill(x_mask, 0), y.masked_fill(y_mask, 0), rtol=0, atol=0, equal_nan=True).all(): + print_red_warning(f"{name} Error: nonfinite value mismatch") if raise_assert: raise AssertionError x = x.masked_fill(~x_mask, 0) y = y.masked_fill(~y_mask, 0) sim = calc_sim(x, y, name) - diff = (1. - sim).item() - print(f'{diff=}') + diff = (1.0 - sim).item() + print(f"{diff=}") if not (0 <= diff <= eps): - print_red_warning(f'{name} Error: {diff=}') + print_red_warning(f"{name} Error: {diff=}") if raise_assert: raise AssertionError diff --git a/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py b/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py index e30845b8d..36b32c0a8 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py +++ b/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py @@ -24,6 +24,7 @@ def get_configs(): the parameter name to its chosen value. """ import itertools + iter_params = dict( block_M=[64, 128, 256], block_N=[64, 128, 256], @@ -32,65 +33,64 @@ def get_configs(): threads=[128, 256, 512], split=[1, 2], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] -@tilelang.autotune(configs=get_configs(),) +@tilelang.autotune( + configs=get_configs(), +) @tilelang.jit( out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - }, + pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, ) -def matmul(M, - N, - K, - in_dtype, - out_dtype, - accum_dtype, - source_format='uint', - num_bits=4, - fast_dequant=True, - block_M=256, - block_N=128, - block_K=128, - num_stages=2, - threads=256, - split=1): +def matmul( + M, + N, + K, + in_dtype, + out_dtype, + accum_dtype, + source_format=T.uint32, + num_bits=4, + fast_dequant=True, + block_M=256, + block_N=128, + block_K=128, + num_stages=2, + threads=256, + split=1, +): + """ + Builds a parameterized TileLang/TIR matrix-multiplication kernel that dequantizes 4-bit FP inputs to BF16 on-the-fly and computes C = A @ B^T. + + This function returns a tiled, autotunable prim_func implementing a block-wise GEMM with shared-memory buffering and a pipelined K-loop. The kernel accepts: + - A: dense input of shape (M, K) with dtype `in_dtype`. 
+ - B: packed quantized input of shape (N, QK) where QK = K / (8 / num_bits) stored as `uint8`. + - C: output of shape (M, N) with dtype `out_dtype`. + + The generated kernel supports two dequantization paths: + - fast_dequant (fast_dequant=True): calls an external mxfp dequantization intrinsic (twiddling-based) loaded from a C source returned by get_mxfp_intrin_group. + - simple dequant (fast_dequant=False): performs a pure-TIR FP4 -> BF16 conversion per element. + + Important behavior and requirements: + - num_bits (default 4) is the bit-width of the quantized elements; storage_dtype is uint8 and num_elems_per_byte = 8 // num_bits. + - QK = K // num_elems_per_byte and Block_QK = block_K // num_elems_per_byte determine B and shared-buffer shapes. + - Asserts that K % (block_K * split) == 0; K must be divisible by block_K * split for the tiling to be valid. + - When fast_dequant is True, a valid mxfp intrinsic group (C source and function name) must be available via tilelang.quantize.get_mxfp_intrin_group. + - The kernel launches a 2D grid over ceildiv(N, block_N) and ceildiv(M, block_M) and uses `threads` threads per block with `num_stages` pipeline stages. + + Parameters that alter kernel layout/behavior (brief): + - block_M, block_N, block_K: tile sizes for M, N, and K dimensions. + - num_stages: number of software pipeline stages for the K-loop. + - threads: number of threads used per kernel block. + - split: extra K-splitting factor; K must be divisible by block_K * split. + - source_format, num_bits: describe the quantized data layout passed to the mxfp intrinsics. + + Returns: + A TileLang/TIR prim_func (the compiled `main`) implementing the described dequantize-then-GEMM kernel. """ - Builds a parameterized TileLang/TIR matrix-multiplication kernel that dequantizes 4-bit FP inputs to BF16 on-the-fly and computes C = A @ B^T. - - This function returns a tiled, autotunable prim_func implementing a block-wise GEMM with shared-memory buffering and a pipelined K-loop. The kernel accepts: - - A: dense input of shape (M, K) with dtype `in_dtype`. - - B: packed quantized input of shape (N, QK) where QK = K / (8 / num_bits) stored as `uint8`. - - C: output of shape (M, N) with dtype `out_dtype`. - - The generated kernel supports two dequantization paths: - - fast_dequant (fast_dequant=True): calls an external mxfp dequantization intrinsic (twiddling-based) loaded from a C source returned by get_mxfp_intrin_group. - - simple dequant (fast_dequant=False): performs a pure-TIR FP4 -> BF16 conversion per element. - - Important behavior and requirements: - - num_bits (default 4) is the bit-width of the quantized elements; storage_dtype is uint8 and num_elems_per_byte = 8 // num_bits. - - QK = K // num_elems_per_byte and Block_QK = block_K // num_elems_per_byte determine B and shared-buffer shapes. - - Asserts that K % (block_K * split) == 0; K must be divisible by block_K * split for the tiling to be valid. - - When fast_dequant is True, a valid mxfp intrinsic group (C source and function name) must be available via tilelang.quantize.get_mxfp_intrin_group. - - The kernel launches a 2D grid over ceildiv(N, block_N) and ceildiv(M, block_M) and uses `threads` threads per block with `num_stages` pipeline stages. - - Parameters that alter kernel layout/behavior (brief): - - block_M, block_N, block_K: tile sizes for M, N, and K dimensions. - - num_stages: number of software pipeline stages for the K-loop. - - threads: number of threads used per kernel block. 
- - split: extra K-splitting factor; K must be divisible by block_K * split. - - source_format, num_bits: describe the quantized data layout passed to the mxfp intrinsics. - - Returns: - A TileLang/TIR prim_func (the compiled `main`) implementing the described dequantize-then-GEMM kernel. - """ num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 QK = K // num_elems_per_byte Block_QK = block_K // num_elems_per_byte @@ -121,7 +121,7 @@ def matmul(M, assert func_name is not None, "mxfp_intrin_info is not found" import_source = import_source - def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Create a TileLang macro that performs fast, twiddling-based dequantization from packed FP4 to BF16 using an external runtime plugin. @@ -131,13 +131,13 @@ def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): - Writes the dequantized BF16 values back to a shared dequantized buffer for use by the kernel. Notes and preconditions: - - Asserts that `in_dtype == "fp4"` and `out_dtype == "bfloat16"`. + - Asserts that `in_dtype == "fp4"` and `out_dtype == T.bfloat16`. - The generated macro depends on several surrounding-scope symbols (e.g., `import_source`, `func_name`, `block_K`, `Block_QK`, `threads`, `num_elems_per_byte`, `storage_dtype`, and `out_dtype`) and expects them to be defined consistently in the enclosing kernel. - The macro is optimized for block-wise, per-thread transactions sized to the target storage width (uses a MAX_TRANSACTION_SIZE_BITS constant) and uses local/register buffers sized accordingly. - The macro uses `T.import_source` to bring the external plugin into the module and `T.call_extern` to perform the high-throughput dequantization; callers must ensure the external function matches the expected calling convention and memory layout. """ assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] # Some variables for dequantization in each thread MAX_TRANSACTION_SIZE_BITS = 128 @@ -189,12 +189,11 @@ def fast_dequant_bf16_fp4_twiddling(B_shared, B_dequantize_shared): # Finally, store the dequantized data to shared memory. for v in T.vectorized(0, local_size): index = i * threads * local_size + tx * local_size + v - B_dequantize_shared[index // block_K, - index % block_K] = B_dequantize_local_thread[v] + B_dequantize_shared[index // block_K, index % block_K] = B_dequantize_local_thread[v] return fast_dequant_bf16_fp4_twiddling - def get_simple_dequant_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_simple_dequant_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Create a simple TIR dequantization macro that converts packed 4-bit FP (FP4) stored in uint8 into bfloat16. @@ -205,7 +204,7 @@ def get_simple_dequant_func(in_dtype="fp4", out_dtype="bfloat16"): - Writes the dequantized bfloat16 block into B_dequantize_shared. Constraints: - - Supports only in_dtype="fp4" and out_dtype="bfloat16". + - Supports only in_dtype="fp4" and out_dtype=T.bfloat16. - The helper assumes nbit == 4 and produces bfloat16 values. - The macro uses a fixed test-scale of 0 (no per-element scaling) as written. @@ -213,49 +212,49 @@ def get_simple_dequant_func(in_dtype="fp4", out_dtype="bfloat16"): A TIR macro function performing the described in-place block dequantization from packed uint8 FP4 to bfloat16. 
""" assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] - def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, - scale: tir.PrimExpr, dtype: str): + def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale: tir.PrimExpr, dtype: str): """ - Convert a 4-bit FP4 value packed in a uint8 byte into a bfloat16 value. - - This helper extracts the 4-bit field located at the bit position `pos` within the - byte `val`, interprets it as an FP4 (sign, exponent, mantissa) value, applies an - exponent `scale` offset to align it with bfloat16 exponent bias, clamps the - resulting exponent to 8 bits, and returns the assembled bfloat16 bit pattern. - - Parameters: - nbit (int): Number of bits in the packed element; must be 4. - val (tir.PrimExpr): A uint8 value containing packed FP4 elements. - pos (tir.PrimExpr): Index (0-based) of which FP4 nibble inside `val` to extract. - scale (tir.PrimExpr): Exponent offset applied when converting FP4 exponent to bfloat16. - dtype (str): Target dtype string; must be "bfloat16". - - Returns: - tir.PrimExpr: A bfloat16-typed PrimExpr containing the converted value. - - Notes: - - The function asserts `nbit == 4`, `dtype == "bfloat16"`, and that `val.dtype` is "uint8". - - The conversion uses a fixed mapping from FP4 exponent/mantissa layout into bfloat16 - bit fields and clamps the computed exponent to fit into 8 bits. + Convert a 4-bit FP4 value packed in a uint8 byte into a bfloat16 value. + + This helper extracts the 4-bit field located at the bit position `pos` within the + byte `val`, interprets it as an FP4 (sign, exponent, mantissa) value, applies an + exponent `scale` offset to align it with bfloat16 exponent bias, clamps the + resulting exponent to 8 bits, and returns the assembled bfloat16 bit pattern. + + Parameters: + nbit (int): Number of bits in the packed element; must be 4. + val (tir.PrimExpr): A uint8 value containing packed FP4 elements. + pos (tir.PrimExpr): Index (0-based) of which FP4 nibble inside `val` to extract. + scale (tir.PrimExpr): Exponent offset applied when converting FP4 exponent to bfloat16. + dtype (str): Target dtype string; must be T.bfloat16. + + Returns: + tir.PrimExpr: A bfloat16-typed PrimExpr containing the converted value. + + Notes: + - The function asserts `nbit == 4`, `dtype == T.bfloat16`, and that `val.dtype` is T.uint8. + - The conversion uses a fixed mapping from FP4 exponent/mantissa layout into bfloat16 + bit fields and clamps the computed exponent to fit into 8 bits. 
""" assert nbit == 4 - assert dtype == "bfloat16" - assert val.dtype == "uint8" - mask = tir.const((1 << nbit) - 1, "uint16") - f4 = (val >> (pos.astype("uint16") * tir.const(nbit, "uint16"))) & mask - s = f4 >> tir.const(3, "uint16") - e_f4 = (f4 & tir.const(6, "uint16")) >> tir.const(1, "uint16") + assert dtype == T.bfloat16 + assert val.dtype == T.uint8 + mask = tir.const((1 << nbit) - 1, T.uint16) + f4 = (val >> (pos.astype(T.uint16) * tir.const(nbit, T.uint16))) & mask + s = f4 >> tir.const(3, T.uint16) + e_f4 = (f4 & tir.const(6, T.uint16)) >> tir.const(1, T.uint16) # Exponential bias between f4 and bf16 is 2^(8-1) - 2^(2-1) = 126 - e_bf16 = e_f4 + tir.const(126, "uint16") + e_bf16 = e_f4 + tir.const(126, T.uint16) # Scale is the exponential part, within the representation of uint8 # To handle the overflow, we use the max function to limit the exponential part to 8 bits - e_bf16 = T.min(e_bf16 + scale, tir.const((1 << 8) - 1, "uint16")) - m_f4 = f4 & tir.const(1, "uint16") + e_bf16 = T.min(e_bf16 + scale, tir.const((1 << 8) - 1, T.uint16)) + m_f4 = f4 & tir.const(1, T.uint16) val_bf16 = tir.reinterpret( - "bfloat16", ((((s << tir.const(8, "uint16")) | e_bf16) << tir.const(7, "uint16")) - | (m_f4 << tir.const(6, "uint16"))).astype("uint16")) + T.bfloat16, + ((((s << tir.const(8, T.uint16)) | e_bf16) << tir.const(7, T.uint16)) | (m_f4 << tir.const(6, T.uint16))).astype(T.uint16), + ) return val_bf16 @T.macro @@ -292,32 +291,32 @@ def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared): @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((M, N), out_dtype), ): """ - Kernel entry for the tiled, pipelined matmul used by the generated prim_func. - - This function implements a block-wise GEMM over a 2D grid (grid dims: ceildiv(N, block_N) x ceildiv(M, block_M)) with a thread block of `threads`. For each output block it: - - Allocates shared buffers for A, the packed/quantized B, and a dequantized B tile. - - Allocates a fragment accumulator (C_local) and a shared output tile (C_shared) with a swizzled layout. - - Pipelines over K in chunks of `block_K` for `num_stages` stages: - - Loads A and packed B tiles into shared memory. - - Dequantizes B into B_dequantize_shared using either the fast (twiddling/external) or the simple (pure-TIR) dequantization routine. - - Performs a GEMM accumulating into C_local with B transposed. - - Stores the accumulated block from C_local back to the global output C via C_shared. - - Parameters: - - A: input tile of shape (M, K) with dtype `in_dtype`. - - B: packed/quantized input of shape (N, QK) with storage dtype `storage_dtype` (quantized FP4 packing). - - C: output tensor of shape (M, N) with dtype `out_dtype`. - - Side effects: - - Writes the computed output block into the global tensor `C`. - - Uses and updates shared memory buffers and per-thread accumulators. - - No value is returned. + Kernel entry for the tiled, pipelined matmul used by the generated prim_func. + + This function implements a block-wise GEMM over a 2D grid (grid dims: ceildiv(N, block_N) x ceildiv(M, block_M)) with a thread block of `threads`. For each output block it: + - Allocates shared buffers for A, the packed/quantized B, and a dequantized B tile. + - Allocates a fragment accumulator (C_local) and a shared output tile (C_shared) with a swizzled layout. 
+ - Pipelines over K in chunks of `block_K` for `num_stages` stages: + - Loads A and packed B tiles into shared memory. + - Dequantizes B into B_dequantize_shared using either the fast (twiddling/external) or the simple (pure-TIR) dequantization routine. + - Performs a GEMM accumulating into C_local with B transposed. + - Stores the accumulated block from C_local back to the global output C via C_shared. + + Parameters: + - A: input tile of shape (M, K) with dtype `in_dtype`. + - B: packed/quantized input of shape (N, QK) with storage dtype `storage_dtype` (quantized FP4 packing). + - C: output tensor of shape (M, N) with dtype `out_dtype`. + + Side effects: + - Writes the computed output block into the global tensor `C`. + - Uses and updates shared memory buffers and per-thread accumulators. + + No value is returned. """ with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -327,10 +326,6 @@ def main( C_local = T.alloc_fragment((block_M, block_N), accum_dtype) C_shared = T.alloc_shared((block_M, block_N), out_dtype) - T.annotate_layout({ - C_shared: tilelang.layout.make_swizzled_layout(C_shared), - }) - T.clear(C_local) for k in T.Pipelined(K // block_K, num_stages=num_stages): T.copy(A[by * block_M, k * block_K], A_shared) @@ -344,7 +339,7 @@ def main( T.gemm(A_shared, B_dequantize_shared, C_local, transpose_B=True) T.copy(C_local, C_shared) - T.copy(C_shared, C[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N]) + T.copy(C_shared, C[by * block_M : (by + 1) * block_M, bx * block_N : (bx + 1) * block_N]) return main @@ -363,7 +358,7 @@ def ref_program_twiddling(A, qB): Returns: torch.Tensor: Result matrix C with shape (M, N) in bfloat16. """ - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert_bit_twiddling(qB) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) @@ -383,7 +378,7 @@ def ref_program_simple(A, qB): Returns: torch.Tensor: Resulting matrix C in bfloat16 with shape (M, N). 
""" - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert(qB) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) @@ -409,16 +404,15 @@ def main(m=256, n=256, k=256, fast_dequant=True, tune=False): """ total_flops = 2 * m * n * k if tune: - kernel = matmul( - m, n, k, "bfloat16", "bfloat16", "float32", num_bits=4, fast_dequant=fast_dequant) + kernel = matmul(m, n, k, T.bfloat16, T.bfloat16, T.float32, num_bits=4, fast_dequant=fast_dequant) else: kernel = matmul( m, n, k, - "bfloat16", - "bfloat16", - "float32", + T.bfloat16, + T.bfloat16, + T.float32, num_bits=4, fast_dequant=fast_dequant, block_M=256, @@ -426,7 +420,8 @@ def main(m=256, n=256, k=256, fast_dequant=True, tune=False): block_K=128, num_stages=2, threads=256, - split=1) + split=1, + ) profiler = kernel.get_profiler(tilelang.TensorSupplyType.Auto) if fast_dequant: profiler.assert_allclose(ref_program_twiddling, rtol=0.01, atol=0.01) @@ -437,6 +432,27 @@ def main(m=256, n=256, k=256, fast_dequant=True, tune=False): print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf(m=4096, n=4096, k=4096, fast_dequant=True): + kernel = matmul( + m, + n, + k, + "bfloat16", + "bfloat16", + "float32", + num_bits=4, + fast_dequant=fast_dequant, + block_M=256, + block_N=128, + block_K=128, + num_stages=2, + threads=256, + split=1, + ) + profiler = kernel.get_profiler(tilelang.TensorSupplyType.Auto) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": main(256, 256, 256, True) main(256, 256, 256, False) diff --git a/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py b/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py index ac1417aeb..cc37c8bc4 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py +++ b/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py @@ -7,45 +7,45 @@ from dequantize_utils import torch_convert_bit_twiddling, torch_convert -def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale: tir.PrimExpr, - dtype: str): +def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale: tir.PrimExpr, dtype: str): """ - Convert a 4-bit field packed in a uint8 into a bfloat16 value, applying an exponent scale. + Convert a 4-bit field packed in a uint8 into a bfloat16 value, applying an exponent scale. - This helper extracts a 4-bit nibble from `val` at byte-nibble position `pos`, interprets its - bits as a sign/exponent/mantissa in the 4-bit custom FP4 layout, adjusts the exponent by - `scale` (clamped to an 8-bit range), and assembles the corresponding bfloat16 representation. + This helper extracts a 4-bit nibble from `val` at byte-nibble position `pos`, interprets its + bits as a sign/exponent/mantissa in the 4-bit custom FP4 layout, adjusts the exponent by + `scale` (clamped to an 8-bit range), and assembles the corresponding bfloat16 representation. - Parameters: - nbit (int): Number of bits in the packed field (must be 4). - val (tir.PrimExpr): Packed input value of dtype `uint8` containing one or more 4-bit fields. - pos (tir.PrimExpr): Index of the nibble within `val` (used to shift/extract the 4-bit field). - scale (tir.PrimExpr): Per-element exponent adjustment added to the extracted exponent (uint-like). - dtype (str): Destination dtype string (must be "bfloat16"). + Parameters: + nbit (int): Number of bits in the packed field (must be 4). 
+ val (tir.PrimExpr): Packed input value of dtype `uint8` containing one or more 4-bit fields. + pos (tir.PrimExpr): Index of the nibble within `val` (used to shift/extract the 4-bit field). + scale (tir.PrimExpr): Per-element exponent adjustment added to the extracted exponent (uint-like). + dtype (str): Destination dtype string (must be T.bfloat16). - Returns: - tir.PrimExpr: The resulting value reinterpreted as `bfloat16`. + Returns: + tir.PrimExpr: The resulting value reinterpreted as `bfloat16`. - Notes: - - Preconditions are enforced via assertions: nbit == 4, dtype == "bfloat16", and val.dtype == "uint8". - - The function clamps the adjusted exponent to the 8-bit range before assembling the bfloat16 bit pattern. - """ + Notes: + - Preconditions are enforced via assertions: nbit == 4, dtype == T.bfloat16, and val.dtype == T.uint8. + - The function clamps the adjusted exponent to the 8-bit range before assembling the bfloat16 bit pattern. + """ assert nbit == 4 - assert dtype == "bfloat16" - assert val.dtype == "uint8" - mask = tir.const((1 << nbit) - 1, "uint16") - f4 = (val >> (pos.astype("uint16") * tir.const(nbit, "uint16"))) & mask - s = f4 >> tir.const(3, "uint16") - e_f4 = (f4 & tir.const(6, "uint16")) >> tir.const(1, "uint16") + assert dtype == T.bfloat16 + assert val.dtype == T.uint8 + mask = tir.const((1 << nbit) - 1, T.uint16) + f4 = (val >> (pos.astype(T.uint16) * tir.const(nbit, T.uint16))) & mask + s = f4 >> tir.const(3, T.uint16) + e_f4 = (f4 & tir.const(6, T.uint16)) >> tir.const(1, T.uint16) # Exponential bias between f4 and bf16 is 2^(8-1) - 2^(2-1) = 126 - e_bf16 = e_f4 + tir.const(126, "uint16") + e_bf16 = e_f4 + tir.const(126, T.uint16) # Scale is the exponential part, within the representation of uint8 # To handle the overflow, we may use the min function to limit the exponential part to 8 bits # e_bf16 = T.min(e_bf16 + scale, tir.const((1 << 8) - 1, "uint16")) - m_f4 = f4 & tir.const(1, "uint16") - val_bf16 = tir.reinterpret("bfloat16", - ((((s << tir.const(8, "uint16")) | e_bf16) << tir.const(7, "uint16")) - | (m_f4 << tir.const(6, "uint16"))).astype("uint16")) + m_f4 = f4 & tir.const(1, T.uint16) + val_bf16 = tir.reinterpret( + T.bfloat16, + ((((s << tir.const(8, T.uint16)) | e_bf16) << tir.const(7, T.uint16)) | (m_f4 << tir.const(6, T.uint16))).astype(T.uint16), + ) return val_bf16 @@ -65,6 +65,7 @@ def get_configs(): List[dict]: A list of configuration dictionaries covering all combinations. 
""" import itertools + iter_params = dict( block_M=[64, 128, 256], block_N=[64, 128, 256], @@ -73,70 +74,74 @@ def get_configs(): threads=[128, 256, 512], split=[1, 2], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] - - -@tilelang.autotune(configs=get_configs(),) -@tilelang.jit(out_idx=[-1],) -def matmul(M, - N, - K, - in_dtype, - out_dtype, - accum_dtype, - source_format='uint', - num_bits=4, - scale_size=32, - fast_dequant=True, - with_bias=False, - block_M=256, - block_N=128, - block_K=128, - num_stages=2, - threads=256, - split=1): + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] + + +@tilelang.autotune( + configs=get_configs(), +) +@tilelang.jit( + out_idx=[-1], +) +def matmul( + M, + N, + K, + in_dtype, + out_dtype, + accum_dtype, + source_format=T.uint32, + num_bits=4, + scale_size=32, + fast_dequant=True, + with_bias=False, + block_M=256, + block_N=128, + block_K=128, + num_stages=2, + threads=256, + split=1, +): """ - Construct and return a tiled matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized B (shape Nx(QK)) and writes an MxN output in out_dtype. - - The generated kernel accepts: - - A: dense matrix with element type `in_dtype`. - - B: packed quantized matrix stored as uint8 with `num_bits` bits per element (QK = K / (8/num_bits)). - - Scale: per-block scale/exponent information used to dequantize B. - The kernel dequantizes B to a working floating format (out_dtype/accum_dtype) using one of two paths: - - fast_dequant (True): uses an external, hardware/implementation-specific intrinsic group (twiddling) for batch dequantization. - - fast_dequant (False): uses a simple elementwise dequantization helper. - - Parameters: - M, N, K (int): matrix dimensions (A is MxK, result is MxN). K must be divisible by (block_K * split). - in_dtype (str): element type of A (e.g., "fp4" in this file). - out_dtype (str): output tensor element type (e.g., "bfloat16"). - accum_dtype (str): accumulation type used for the inner GEMM. - source_format (str, optional): format string passed to intrinsic selector (default "uint"). - num_bits (int, optional): number of bits per quantized element in B (default 4). - scale_size (int, optional): number of elements grouped per scale entry (default 32). - fast_dequant (bool, optional): choose the fast intrinsic dequantization path when available (default True). - block_M, block_N, block_K (int, optional): tile sizes for M, N, and K dimensions (defaults 256, 128, 128). - num_stages (int, optional): pipelining stages for K loop (default 2). - threads (int, optional): threads per block used by the kernel (default 256). - split (int, optional): split factor along K used by the scheduler (default 1). - with_bias (bool, optional): whether to add Bias to the output (default False). - - Returns: - A T.prim_func implementing the tiled, pipelined GEMM that: - - loads tiled blocks of A and packed B to shared memory, - - dequantizes B via the chosen path into a shared dequantized tile, - - performs a tiled GEMM accumulating into local fragments, - - writes the final MxN block to the global output tensor. + Construct and return a tiled matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized B (shape Nx(QK)) and writes an MxN output in out_dtype. 
- Notes: - - The function queries an intrinsic group to obtain a fast dequantization implementation when fast_dequant is enabled; that intrinsic must supply a valid C source and function name. - - The kernel layout uses swizzled shared-memory layouts for A, B, and the shared C tile. - - An assertion enforces that K % (block_K * split) == 0. + The generated kernel accepts: + - A: dense matrix with element type `in_dtype`. + - B: packed quantized matrix stored as uint8 with `num_bits` bits per element (QK = K / (8/num_bits)). + - Scale: per-block scale/exponent information used to dequantize B. + The kernel dequantizes B to a working floating format (out_dtype/accum_dtype) using one of two paths: + - fast_dequant (True): uses an external, hardware/implementation-specific intrinsic group (twiddling) for batch dequantization. + - fast_dequant (False): uses a simple elementwise dequantization helper. + + Parameters: + M, N, K (int): matrix dimensions (A is MxK, result is MxN). K must be divisible by (block_K * split). + in_dtype (str): element type of A (e.g., "fp4" in this file). + out_dtype (str): output tensor element type (e.g., T.bfloat16). + accum_dtype (str): accumulation type used for the inner GEMM. + source_format (str, optional): format string passed to intrinsic selector (default "uint"). + num_bits (int, optional): number of bits per quantized element in B (default 4). + scale_size (int, optional): number of elements grouped per scale entry (default 32). + fast_dequant (bool, optional): choose the fast intrinsic dequantization path when available (default True). + block_M, block_N, block_K (int, optional): tile sizes for M, N, and K dimensions (defaults 256, 128, 128). + num_stages (int, optional): pipelining stages for K loop (default 2). + threads (int, optional): threads per block used by the kernel (default 256). + split (int, optional): split factor along K used by the scheduler (default 1). + with_bias (bool, optional): whether to add Bias to the output (default False). + + Returns: + A T.prim_func implementing the tiled, pipelined GEMM that: + - loads tiled blocks of A and packed B to shared memory, + - dequantizes B via the chosen path into a shared dequantized tile, + - performs a tiled GEMM accumulating into local fragments, + - writes the final MxN block to the global output tensor. + + Notes: + - The function queries an intrinsic group to obtain a fast dequantization implementation when fast_dequant is enabled; that intrinsic must supply a valid C source and function name. + - The kernel layout uses swizzled shared-memory layouts for A, B, and the shared C tile. + - An assertion enforces that K % (block_K * split) == 0. """ num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 QK = K // num_elems_per_byte Block_QK = block_K // num_elems_per_byte A_shape = (M, K) @@ -150,6 +155,7 @@ def matmul(M, assert K % (block_K * split) == 0 from tilelang.quantize import get_mxfp_intrin_group + # fast_dequant_bf16_fp4_twiddling mxfp_intrin_info = get_mxfp_intrin_group( out_dtype=in_dtype, @@ -164,7 +170,7 @@ def matmul(M, assert func_name is not None, "mxfp_intrin_info is not found" import_source = import_source - def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Return a TileLang macro that performs fast dequantization of twiddled FP4-packed data into BF16. 
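The quantized-storage shapes defined above (QK, Block_QK, Scale_shape) follow directly from the packing arithmetic. A worked example with this file's defaults (num_bits=4, scale_size=32, block_K=128) and the K=256 used in the __main__ demo:

# Worked packing arithmetic; K is the demo size, the rest are the defaults in this file.
num_bits = 4
num_elems_per_byte = 8 // num_bits          # 2 FP4 values per uint8 byte
K, block_K, scale_size = 256, 128, 32

QK = K // num_elems_per_byte                # B is stored as (N, 128) uint8
Block_QK = block_K // num_elems_per_byte    # each B tile in shared memory is (block_N, 64) bytes
scale_cols = K // scale_size                # one uint8 exponent per 32 weights -> Scale is (N, 8)

assert (QK, Block_QK, scale_cols) == (128, 64, 8)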
@@ -175,12 +181,12 @@ def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): - Writes the scaled BF16 results into B_dequantize_shared. Notes: - - This factory only supports in_dtype="fp4" and out_dtype="bfloat16". + - This factory only supports in_dtype="fp4" and out_dtype=T.bfloat16. - The macro depends on several names from the enclosing scope (e.g., import_source, func_name, DataType, num_elems_per_byte, storage_dtype, block_N, block_K, threads, scale_size); those must be defined and consistent with the kernel that will use the macro. - The macro issues a T.import_source and T.call_extern to invoke the external intrinsic; ensure the external implementation matching `func_name` is available at compilation/runtime. """ assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] # Some variables for dequantization in each thread MAX_TRANSACTION_SIZE_BITS = 128 @@ -252,24 +258,23 @@ def fast_dequant_bf16_fp4_twiddling(B_shared, B_dequantize_shared, Scale, k): for v in T.vectorized(0, local_size): index = i * threads * local_size + tx * local_size + v - B_dequantize_shared[index // block_K, - index % block_K] = B_dequantize_local_thread[v] + B_dequantize_shared[index // block_K, index % block_K] = B_dequantize_local_thread[v] return fast_dequant_bf16_fp4_twiddling - def get_simple_dequant_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_simple_dequant_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Create a simple (scalar) dequantization macro that converts 4-bit packed inputs to bfloat16. Returns a T.macro that, given shared-storage buffers B_shared, B_dequantize_shared, a Scale tensor, and block index k, unpacks 4-bit values from B_shared, converts each nibble to a bfloat16 value using _tir_u8_to_f4_to_bf16, applies the per-element exponential Scale, and writes the dequantized BF16 block into B_dequantize_shared. Notes: - - Only supports in_dtype="fp4" and out_dtype="bfloat16". + - Only supports in_dtype="fp4" and out_dtype=T.bfloat16. - The macro expects B_shared and B_dequantize_shared to have the shapes established in the enclosing scope (B_shared_shape, B_dequantize_shared_shape) and performs block-local copying into allocated fragments before elementwise conversion. - Scale holds the exponent-like scaling values indexed per output element as used by the conversion helper. 
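The per-thread sizes in the fast (twiddling) path come from the 128-bit transaction width. A quick sanity check of that arithmetic for the bf16 output path with the default tile (block_N=128, block_K=128, threads=256); plain Python, illustrative only:

# Per-thread sizing in the fast (twiddling) dequant path, bf16 output / fp4 input.
MAX_TRANSACTION_SIZE_BITS = 128
out_dtype_bits = 16                                         # bfloat16
num_elems_per_byte = 2                                      # two 4-bit values per byte

local_size = MAX_TRANSACTION_SIZE_BITS // out_dtype_bits    # 8 bf16 values per vectorized store
local_compress_size = local_size // num_elems_per_byte      # 4 packed uint8 bytes loaded per thread

block_N, block_K, threads = 128, 128, 256                   # default tile in this example
outer_iters = block_N * block_K // threads // local_size    # 8 outer iterations cover one B tile

assert (local_size, local_compress_size, outer_iters) == (8, 4, 8)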
""" assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] @T.macro def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale, k): @@ -301,33 +306,32 @@ def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale, k): B_local[i, j // num_elems_per_byte], j % num_elems_per_byte, Scale[ - bx * block_N + i, k * block_K // scale_size + j // - scale_size], # Scale is the exponential part, within the representation of uint8 + bx * block_N + i, k * block_K // scale_size + j // scale_size + ], # Scale is the exponential part, within the representation of uint8 dtype=out_dtype, - ) * T.shift_left( - 1, (Scale[bx * block_N + i, k * block_K // scale_size + j // scale_size])) + ) * T.shift_left(1, (Scale[bx * block_N + i, k * block_K // scale_size + j // scale_size])) T.copy(B_dequantize_local, B_dequantize_shared) return simple_dequant_bf16_fp4 @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - Scale: T.Tensor(Scale_shape, storage_dtype), - Bias: T.Tensor(Bias_shape, out_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Scale: T.Tensor(Scale_shape, storage_dtype), + Bias: T.Tensor(Bias_shape, out_dtype), + C: T.Tensor((M, N), out_dtype), ): """ - Tiled, pipelined kernel entry that multiplies A with a quantized B (with per-block Scale) producing C. + Tiled, pipelined kernel entry that multiplies A with a quantized B (with per-block Scale) producing C. - This prim-level kernel implements a blocked, multi-threaded matmul: it loads tiles of A and the packed/quantized B into shared memory, dequantizes B (either via the fast intrinsic twiddling path or the simple per-element path), performs a block GEMM (with B transposed), and writes the accumulated block results into the output tensor C. The kernel allocates shared buffers for A, B, and the dequantized B, and a local fragment for accumulation; it runs over K in pipelined stages and expects the provided shapes and dtypes to match the tiling parameters used to build the function. + This prim-level kernel implements a blocked, multi-threaded matmul: it loads tiles of A and the packed/quantized B into shared memory, dequantizes B (either via the fast intrinsic twiddling path or the simple per-element path), performs a block GEMM (with B transposed), and writes the accumulated block results into the output tensor C. The kernel allocates shared buffers for A, B, and the dequantized B, and a local fragment for accumulation; it runs over K in pipelined stages and expects the provided shapes and dtypes to match the tiling parameters used to build the function. - Parameters are self-descriptive in the signature; notable behaviors: - - B is stored in a compact uint8-packed layout (num_bits per element) and is dequantized using Scale before GEMM. - - The selected dequantization path is controlled by the outer-scope flag `fast_dequant`. - - The GEMM uses transpose_B=True (i.e., multiplies A · B^T after dequantization). - - The function writes results in-place into C. + Parameters are self-descriptive in the signature; notable behaviors: + - B is stored in a compact uint8-packed layout (num_bits per element) and is dequantized using Scale before GEMM. + - The selected dequantization path is controlled by the outer-scope flag `fast_dequant`. + - The GEMM uses transpose_B=True (i.e., multiplies A · B^T after dequantization). + - The function writes results in-place into C. 
""" with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -337,23 +341,24 @@ def main( C_local = T.alloc_fragment((block_M, block_N), accum_dtype) C_shared = T.alloc_shared((block_M, block_N), out_dtype) - T.annotate_layout({ - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - C_shared: tilelang.layout.make_swizzled_layout(C_shared), - }) + T.annotate_layout( + { + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + } + ) if with_bias: - T.annotate_layout({ - Bias_shared: tilelang.layout.make_swizzled_layout(Bias_shared), - }) + T.annotate_layout( + { + Bias_shared: tilelang.layout.make_swizzled_layout(Bias_shared), + } + ) if threads == 512: T.disable_warp_group_reg_alloc() if with_bias: - T.copy(Bias[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N], - Bias_shared) + T.copy(Bias[by * block_M : (by + 1) * block_M, bx * block_N : (bx + 1) * block_N], Bias_shared) T.copy(Bias_shared, C_local) else: T.clear(C_local) @@ -368,7 +373,7 @@ def main( T.gemm(A_shared, B_dequantize_shared, C_local, transpose_B=True) T.copy(C_local, C_shared) - T.copy(C_shared, C[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N]) + T.copy(C_shared, C[by * block_M : (by + 1) * block_M, bx * block_N : (bx + 1) * block_N]) return main @@ -387,9 +392,9 @@ def ref_program_twiddling(A, qB, Scale, Bias=None): Returns: torch.Tensor: Resulting matrix C with shape (M, N) in bfloat16. """ - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert_bit_twiddling(qB) - B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) + B *= 2 ** (Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) return C @@ -410,9 +415,9 @@ def ref_program_twiddling_with_bias(A, qB, Scale, Bias): Returns: torch.Tensor: Resulting matrix C with shape (M, N) in bfloat16. """ - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert_bit_twiddling(qB) - B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) + B *= 2 ** (Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + Bias C = C.to(torch.__getattribute__(dtypeC)) return C @@ -434,9 +439,9 @@ def ref_program_simple(A, qB, Scale, Bias=None): No in-place modification is performed on inputs (a local floating copy of B is scaled). """ - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert(qB) - B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) + B *= 2 ** (Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) return C @@ -462,9 +467,9 @@ def ref_program_simple_with_bias(A, qB, Scale, Bias): No in-place modification is performed on inputs (a local floating copy of B is scaled). 
""" - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert(qB) - B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) + B *= 2 ** (Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + Bias C = C.to(torch.__getattribute__(dtypeC)) return C @@ -491,24 +496,16 @@ def main(m=256, n=256, k=256, scale_size=32, fast_dequant=True, with_bias=False, if tune: kernel = matmul( - m, - n, - k, - "bfloat16", - "bfloat16", - "float32", - num_bits=4, - scale_size=scale_size, - fast_dequant=fast_dequant, - with_bias=with_bias) + m, n, k, T.bfloat16, T.bfloat16, T.float32, num_bits=4, scale_size=scale_size, fast_dequant=fast_dequant, with_bias=with_bias + ) else: kernel = matmul( m, n, k, - "bfloat16", - "bfloat16", - "float32", + T.bfloat16, + T.bfloat16, + T.float32, num_bits=4, scale_size=scale_size, block_M=256, @@ -518,7 +515,8 @@ def main(m=256, n=256, k=256, scale_size=32, fast_dequant=True, with_bias=False, threads=256, split=1, fast_dequant=fast_dequant, - with_bias=with_bias) + with_bias=with_bias, + ) profiler = kernel.get_profiler(tilelang.TensorSupplyType.Auto) @@ -538,6 +536,29 @@ def main(m=256, n=256, k=256, scale_size=32, fast_dequant=True, with_bias=False, print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf(m=4096, n=4096, k=4096, scale_size=32, fast_dequant=True, with_bias=False): + kernel = matmul( + m, + n, + k, + "bfloat16", + "bfloat16", + "float32", + num_bits=4, + scale_size=scale_size, + block_M=256, + block_N=128, + block_K=128, + num_stages=2, + threads=256, + split=1, + fast_dequant=fast_dequant, + with_bias=with_bias, + ) + profiler = kernel.get_profiler(tilelang.TensorSupplyType.Auto) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": M, N, K = 256, 256, 256 scale_size = 32 diff --git a/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper_tma.py b/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper_tma.py deleted file mode 100644 index 7dad79597..000000000 --- a/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper_tma.py +++ /dev/null @@ -1,563 +0,0 @@ -import tilelang -import tilelang.language as T -from tilelang import tvm as tvm -from tvm import DataType -from tvm import tir -import torch -from dequantize_utils import torch_convert_bit_twiddling, torch_convert - - -def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale: tir.PrimExpr, - dtype: str): - """ - Convert a 4-bit field packed in a uint8 into a bfloat16 value, applying an exponent scale. - - This helper extracts a 4-bit nibble from `val` at byte-nibble position `pos`, interprets its - bits as a sign/exponent/mantissa in the 4-bit custom FP4 layout, adjusts the exponent by - `scale` (clamped to an 8-bit range), and assembles the corresponding bfloat16 representation. - - Parameters: - nbit (int): Number of bits in the packed field (must be 4). - val (tir.PrimExpr): Packed input value of dtype `uint8` containing one or more 4-bit fields. - pos (tir.PrimExpr): Index of the nibble within `val` (used to shift/extract the 4-bit field). - scale (tir.PrimExpr): Per-element exponent adjustment added to the extracted exponent (uint-like). - dtype (str): Destination dtype string (must be "bfloat16"). - - Returns: - tir.PrimExpr: The resulting value reinterpreted as `bfloat16`. - - Notes: - - Preconditions are enforced via assertions: nbit == 4, dtype == "bfloat16", and val.dtype == "uint8". 
- - The function clamps the adjusted exponent to the 8-bit range before assembling the bfloat16 bit pattern. - """ - assert nbit == 4 - assert dtype == "bfloat16" - assert val.dtype == "uint8" - mask = tir.const((1 << nbit) - 1, "uint16") - f4 = (val >> (pos.astype("uint16") * tir.const(nbit, "uint16"))) & mask - s = f4 >> tir.const(3, "uint16") - e_f4 = (f4 & tir.const(6, "uint16")) >> tir.const(1, "uint16") - # Exponential bias between f4 and bf16 is 2^(8-1) - 2^(2-1) = 126 - e_bf16 = e_f4 + tir.const(126, "uint16") - # Scale is the exponential part, within the representation of uint8 - # To handle the overflow, we may use the min function to limit the exponential part to 8 bits - # e_bf16 = T.min(e_bf16 + scale, tir.const((1 << 8) - 1, "uint16")) - m_f4 = f4 & tir.const(1, "uint16") - val_bf16 = tir.reinterpret("bfloat16", - ((((s << tir.const(8, "uint16")) | e_bf16) << tir.const(7, "uint16")) - | (m_f4 << tir.const(6, "uint16"))).astype("uint16")) - return val_bf16 - - -def get_configs(): - """ - Generate a list of hyperparameter configuration dictionaries for tuning. - - Each configuration is a dict with keys: 'block_M', 'block_N', 'block_K', - 'num_stages', 'threads', and 'split'. The function returns the Cartesian - product of the parameter value lists: - - block_M, block_N, block_K: tiling sizes (64, 128, 256) - - num_stages: pipeline stages (0, 2) - - threads: thread counts (128, 256, 512) - - split: K-splitting factor (1, 2) - - Returns: - List[dict]: A list of configuration dictionaries covering all combinations. - """ - import itertools - iter_params = dict( - block_M=[64, 128, 256], - block_N=[64, 128, 256], - block_K=[64, 128, 256], - num_stages=[0, 1, 2], - threads=[128, 256, 512], - split=[1, 2], - ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] - - -@tilelang.autotune(configs=get_configs(),) -@tilelang.jit(out_idx=[-1],) -def matmul(M, - N, - K, - in_dtype, - out_dtype, - accum_dtype, - source_format='uint', - num_bits=4, - scale_size=32, - fast_dequant=True, - with_bias=False, - block_M=256, - block_N=128, - block_K=128, - num_stages=2, - threads=256, - split=1): - """ - Construct and return a tiled matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized B (shape Nx(QK)) and writes an MxN output in out_dtype. - - The generated kernel accepts: - - A: dense matrix with element type `in_dtype`. - - B: packed quantized matrix stored as uint8 with `num_bits` bits per element (QK = K / (8/num_bits)). - - Scale: per-block scale/exponent information used to dequantize B. - The kernel dequantizes B to a working floating format (out_dtype/accum_dtype) using one of two paths: - - fast_dequant (True): uses an external, hardware/implementation-specific intrinsic group (twiddling) for batch dequantization. - - fast_dequant (False): uses a simple elementwise dequantization helper. - - Parameters: - M, N, K (int): matrix dimensions (A is MxK, result is MxN). K must be divisible by (block_K * split). - in_dtype (str): element type of A (e.g., "fp4" in this file). - out_dtype (str): output tensor element type (e.g., "bfloat16"). - accum_dtype (str): accumulation type used for the inner GEMM. - source_format (str, optional): format string passed to intrinsic selector (default "uint"). - num_bits (int, optional): number of bits per quantized element in B (default 4). - scale_size (int, optional): number of elements grouped per scale entry (default 32). 
- fast_dequant (bool, optional): choose the fast intrinsic dequantization path when available (default True). - block_M, block_N, block_K (int, optional): tile sizes for M, N, and K dimensions (defaults 256, 128, 128). - num_stages (int, optional): pipelining stages for K loop (default 2). - threads (int, optional): threads per block used by the kernel (default 256). - split (int, optional): split factor along K used by the scheduler (default 1). - with_bias (bool, optional): whether to add Bias to the output (default False). - - Returns: - A T.prim_func implementing the tiled, pipelined GEMM that: - - loads tiled blocks of A and packed B to shared memory, - - dequantizes B via the chosen path into a shared dequantized tile, - - performs a tiled GEMM accumulating into local fragments, - - writes the final MxN block to the global output tensor. - - Notes: - - The function queries an intrinsic group to obtain a fast dequantization implementation when fast_dequant is enabled; that intrinsic must supply a valid C source and function name. - - The kernel layout uses swizzled shared-memory layouts for A, B, and the shared C tile. - - An assertion enforces that K % (block_K * split) == 0. - """ - num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" - QK = K // num_elems_per_byte - Block_QK = block_K // num_elems_per_byte - A_shape = (M, K) - B_shape = (N, QK) - Bias_shape = (M, N) - Scale_shape = (N, K // scale_size) - A_shared_shape = (block_M, block_K) - B_shared_shape = (block_N, Block_QK) - Bias_shared_shape = (block_M, block_N) - B_dequantize_shared_shape = (block_N, block_K) - assert K % (block_K * split) == 0 - - from tilelang.quantize import get_mxfp_intrin_group - # fast_dequant_bf16_fp4_twiddling - mxfp_intrin_info = get_mxfp_intrin_group( - out_dtype=in_dtype, - source_format=source_format, - source_bit=num_bits, - storage_dtype=storage_dtype, - use_twiddling=True, - ) - import_source = mxfp_intrin_info["c_source"] - func_name = mxfp_intrin_info["func_name"] - assert import_source is not None, "mxfp_intrin_info is not found" - assert func_name is not None, "mxfp_intrin_info is not found" - import_source = import_source - - def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): - """ - Return a TileLang macro that performs fast dequantization of twiddled FP4-packed data into BF16. - - The returned macro has signature (B_shared, B_dequantize_shared, Scale, k) and: - - Loads packed FP4 elements from B_shared into per-thread local registers. - - Calls an external fast dequantization intrinsic (provided via `import_source` / `func_name` in the outer scope) to expand packed FP4 -> BF16 values. - - Applies a per-block scale factor derived from the Scale tensor (using exponentiation by powers of two). - - Writes the scaled BF16 results into B_dequantize_shared. - - Notes: - - This factory only supports in_dtype="fp4" and out_dtype="bfloat16". - - The macro depends on several names from the enclosing scope (e.g., import_source, func_name, DataType, num_elems_per_byte, storage_dtype, block_N, block_K, threads, scale_size); those must be defined and consistent with the kernel that will use the macro. - - The macro issues a T.import_source and T.call_extern to invoke the external intrinsic; ensure the external implementation matching `func_name` is available at compilation/runtime. 
- """ - assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] - - # Some variables for dequantization in each thread - MAX_TRANSACTION_SIZE_BITS = 128 - local_size = MAX_TRANSACTION_SIZE_BITS // DataType(out_dtype).bits - local_compress_size = local_size // num_elems_per_byte - - @T.macro - def fast_dequant_bf16_fp4_twiddling(B_shared, B_dequantize_shared, Scale_shared, k): - # import fast_dequantize plugin - """ - Fast dequantization kernel: convert packed 4-bit quantized values in B_shared to bfloat16 - in B_dequantize_shared using an external intrinsic optimized for twiddled (bit-packed) FP4, - applying per-block scale factors from Scale. - - This routine is a tiled, thread-parallel helper that: - - Imports and calls an external dequantization function (via `import_source`/`func_name`) - to expand compressed uint8-packed FP4 values into BF16 fragments in-thread. - - Loads the corresponding per-block scale entry, interprets it as an exponent bias - (applies 2^(Scale - 127)), and multiplies the dequantized BF16 fragment by that factor. - - Writes the scaled BF16 results back into the shared B_dequantize_shared buffer in-place. - - Parameters: - - B_shared: read-only shared buffer containing compressed FP4 data (packed uint8 layout). - - B_dequantize_shared: shared output buffer that is overwritten with BF16 dequantized values. - - Scale: per-block scale tensor; entries are interpreted such that the multiplicative scale - = 2^(Scale - 127). - - k: block index along the K dimension used to select the appropriate Scale entries. - - Side effects: - - Mutates B_dequantize_shared in shared memory. - - Calls an external intrinsic function (must be provided by the environment via `import_source` - and `func_name`) to perform the low-level unpacking/dequantization. - """ - T.import_source(import_source) - - tx = T.get_thread_binding() - bx = T.get_block_binding(0) # noqa: F841 - - B_local_thread = T.alloc_local((local_compress_size,), storage_dtype) - B_dequantize_local_thread = T.alloc_local((local_size,), out_dtype) - Scale_local_thread = T.alloc_local((1,), storage_dtype) - Scale_local_thread_exponent = T.alloc_local((1,), out_dtype) - - for i in T.serial(0, block_N * block_K // threads // local_size): - # First, load data from share memory to register. - # Prepare for dequant. - index_base = i * threads * local_compress_size + tx * local_compress_size - for v in T.vectorized(0, local_compress_size): - index = index_base + v - B_local_thread[v] = B_shared[index // Block_QK, index % Block_QK] - index_scale = index_base // (scale_size // num_elems_per_byte) - si = index_scale // (block_K // scale_size) - sj = index_scale % (block_K // scale_size) - Scale_local_thread[0] = Scale_shared[si, k * block_K // scale_size + sj] - Scale_local_thread_exponent[0] = T.shift_left(1, (Scale_local_thread[0])) - - # Then, dequant. - T.call_extern( - func_name, - T.address_of(B_local_thread[0]), - T.address_of(B_dequantize_local_thread[0]), - 1, - dtype=out_dtype, - ) - - # Finally, store the dequantized data to shared memory. 
- for v in T.Parallel(local_size): - B_dequantize_local_thread[v] *= Scale_local_thread_exponent[0] - - for v in T.vectorized(0, local_size): - index = i * threads * local_size + tx * local_size + v - B_dequantize_shared[index // block_K, - index % block_K] = B_dequantize_local_thread[v] - - return fast_dequant_bf16_fp4_twiddling - - def get_simple_dequant_func(in_dtype="fp4", out_dtype="bfloat16"): - """ - Create a simple (scalar) dequantization macro that converts 4-bit packed inputs to bfloat16. - - Returns a T.macro that, given shared-storage buffers B_shared, B_dequantize_shared, a Scale tensor, and block index k, unpacks 4-bit values from B_shared, converts each nibble to a bfloat16 value using _tir_u8_to_f4_to_bf16, applies the per-element exponential Scale, and writes the dequantized BF16 block into B_dequantize_shared. - - Notes: - - Only supports in_dtype="fp4" and out_dtype="bfloat16". - - The macro expects B_shared and B_dequantize_shared to have the shapes established in the enclosing scope (B_shared_shape, B_dequantize_shared_shape) and performs block-local copying into allocated fragments before elementwise conversion. - - Scale holds the exponent-like scaling values indexed per output element as used by the conversion helper. - """ - assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] - - @T.macro - def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale_shared, k): - """ - Dequantizes a packed 4-bit (FP4) block from B_shared into BF16 values in B_dequantize_shared using per-element scale exponents. - - Per-element behavior: - - Reads packed 4-bit entries from B_shared (uint8 storage, multiple nibbles per byte). - - Uses Scale to obtain an exponent term (stored as uint8) and reconstructs BF16 values via _tir_u8_to_f4_to_bf16. - - Writes the dequantized BF16 block into B_dequantize_shared. - - Parameters: - - B_shared: shared-memory buffer holding packed 4-bit values (uint8-packed layout). - - B_dequantize_shared: shared-memory buffer to receive dequantized BF16 results. - - Scale: per-element exponent buffer; used to compute the scale factor for each dequantized element. - - k: current block index along the K dimension (used to select the appropriate slice of Scale). - - Side effects: - - Mutates B_dequantize_shared by storing the dequantized BF16 fragment. - """ - B_local = T.alloc_fragment(B_shared_shape, storage_dtype) - B_dequantize_local = T.alloc_fragment(B_dequantize_shared_shape, out_dtype) - - bx = T.get_block_binding(0) # noqa: F841 - T.copy(B_shared, B_local) - for i, j in T.Parallel(block_N, block_K): - B_dequantize_local[i, j] = _tir_u8_to_f4_to_bf16( - num_bits, - B_local[i, j // num_elems_per_byte], - j % num_elems_per_byte, - Scale_shared[ - i, k * block_K // scale_size + j // - scale_size], # Scale is the exponential part, within the representation of uint8 - dtype=out_dtype, - ) * T.shift_left(1, (Scale_shared[i, k * block_K // scale_size + j // scale_size])) - T.copy(B_dequantize_local, B_dequantize_shared) - - return simple_dequant_bf16_fp4 - - @T.prim_func - def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - Scale: T.Tensor(Scale_shape, storage_dtype), - Bias: T.Tensor(Bias_shape, out_dtype), - C: T.Tensor((M, N), out_dtype), - ): - """ - Tiled, pipelined kernel entry that multiplies A with a quantized B (with per-block Scale) producing C. 
- - This prim-level kernel implements a blocked, multi-threaded matmul: it loads tiles of A and the packed/quantized B into shared memory, dequantizes B (either via the fast intrinsic twiddling path or the simple per-element path), performs a block GEMM (with B transposed), and writes the accumulated block results into the output tensor C. The kernel allocates shared buffers for A, B, and the dequantized B, and a local fragment for accumulation; it runs over K in pipelined stages and expects the provided shapes and dtypes to match the tiling parameters used to build the function. - - Parameters are self-descriptive in the signature; notable behaviors: - - B is stored in a compact uint8-packed layout (num_bits per element) and is dequantized using Scale before GEMM. - - The selected dequantization path is controlled by the outer-scope flag `fast_dequant`. - - The GEMM uses transpose_B=True (i.e., multiplies A · B^T after dequantization). - - The function writes results in-place into C. - """ - with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype) - B_shared = T.alloc_shared(B_shared_shape, storage_dtype) - B_dequantize_shared = T.alloc_shared(B_dequantize_shared_shape, in_dtype) - Bias_shared = T.alloc_shared(Bias_shared_shape, out_dtype) - C_local = T.alloc_fragment((block_M, block_N), accum_dtype) - C_shared = T.alloc_shared((block_M, block_N), out_dtype) - # To use 1D TMA, the last dim of Scale_shared must have stride=1 - # May use much more shared memory than necessary - Scale_shared = T.alloc_shared((block_N, K // scale_size), storage_dtype) - - T.annotate_layout({ - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - C_shared: tilelang.layout.make_swizzled_layout(C_shared), - }) - - if with_bias: - T.annotate_layout({ - Bias_shared: tilelang.layout.make_swizzled_layout(Bias_shared), - }) - - if threads == 512: - T.disable_warp_group_reg_alloc() - - if with_bias: - # T.copy(Bias[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N], - # Bias_shared) - # T.copy(Bias_shared, C_local) - T.copy(Bias[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N], - C_local) - else: - T.clear(C_local) - - # Use 1D TMA to load Scale - T.copy(Scale[bx * block_N:(bx + 1) * block_N, :], Scale_shared) - - for k in T.Pipelined(K // block_K, num_stages=num_stages): - T.copy(A[by * block_M, k * block_K], A_shared) - T.copy(B[bx * block_N, k * block_K // num_elems_per_byte], B_shared) - if fast_dequant: - get_fast_dequant_twiddling_func()(B_shared, B_dequantize_shared, Scale_shared, - k) - else: - get_simple_dequant_func()(B_shared, B_dequantize_shared, Scale_shared, k) - T.gemm(A_shared, B_dequantize_shared, C_local, transpose_B=True) - - T.copy(C_local, C_shared) - T.copy(C_shared, C[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N]) - - return main - - -def ref_program_twiddling(A, qB, Scale, Bias=None): - """ - Compute A @ B^T where B is reconstructed from bit-twiddled 4-bit quantized data and per-block scales, returning bfloat16 results. - - Converts the quantized matrix `qB` to floating-point via `torch_convert_bit_twiddling`, applies a per-element scale factor of 2^(Scale - 127) (where Scale indexes are grouped by 32 columns of B), computes the matrix product A · B^T in float, and casts the result to bfloat16. - - Parameters: - A (torch.Tensor): Left operand with shape (M, K), used in floating precision. 
- qB (torch.Tensor): Quantized representation of B (packed 4-bit values) compatible with torch_convert_bit_twiddling. - Scale (torch.Tensor): Per-column-group scale values; Scale indices correspond to groups of 32 columns in B. - - Returns: - torch.Tensor: Resulting matrix C with shape (M, N) in bfloat16. - """ - dtypeC = "bfloat16" - B = torch_convert_bit_twiddling(qB) - for i in range(B.shape[0]): - for j in range(B.shape[1]): - B[i][j] = B[i][j] * (2**(Scale[i][j // 32])) - C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) - C = C.to(torch.__getattribute__(dtypeC)) - return C - - -def ref_program_twiddling_with_bias(A, qB, Scale, Bias): - """ - Compute A @ B^T where B is reconstructed from bit-twiddled 4-bit quantized data and per-block scales, returning bfloat16 results. - - Converts the quantized matrix `qB` to floating-point via `torch_convert_bit_twiddling`, applies a per-element scale factor of 2^(Scale - 127) (where Scale indexes are grouped by 32 columns of B), computes the matrix product A · B^T in float, and casts the result to bfloat16. - - Parameters: - A (torch.Tensor): Left operand with shape (M, K), used in floating precision. - qB (torch.Tensor): Quantized representation of B (packed 4-bit values) compatible with torch_convert_bit_twiddling. - Scale (torch.Tensor): Per-column-group scale values; Scale indices correspond to groups of 32 columns in B. - Bias (torch.Tensor): Bias tensor with shape (M, N). - - Returns: - torch.Tensor: Resulting matrix C with shape (M, N) in bfloat16. - """ - dtypeC = "bfloat16" - B = torch_convert_bit_twiddling(qB) - for i in range(B.shape[0]): - for j in range(B.shape[1]): - B[i][j] = B[i][j] * (2**(Scale[i][j // 32])) - C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + Bias - C = C.to(torch.__getattribute__(dtypeC)) - return C - - -def ref_program_simple(A, qB, Scale, Bias=None): - """ - Compute a BF16 matrix product A · B^T from a quantized B with simple (non-twiddling) dequantization. - - Converts the quantized tensor `qB` to floating B via `torch_convert`, applies a per-element scale factor computed as 2^(Scale[i][j//32] - 127) (Scale supplies exponent offsets in 32-column groups), then computes C = A · B^T and returns the result converted to bfloat16. - - Parameters: - - A: 2D tensor representing the left operand (will be cast to float32 for the matmul). - - qB: Quantized representation of B accepted by `torch_convert`. - - Scale: 2D tensor of exponent offsets; Scale[i][g] is applied to columns j where g == j // 32. - - Returns: - - 2D bfloat16 tensor C containing the matrix product A · B^T. - - No in-place modification is performed on inputs (a local floating copy of B is scaled). - """ - dtypeC = "bfloat16" - B = torch_convert(qB) - for i in range(B.shape[0]): - for j in range(B.shape[1]): - B[i][j] = B[i][j] * (2**(Scale[i][j // 32])) - C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) - C = C.to(torch.__getattribute__(dtypeC)) - return C - - -def ref_program_simple_with_bias(A, qB, Scale, Bias): - """ - Compute a BF16 matrix product A · B^T from a quantized B with simple (non-twiddling) dequantization. - - Converts the quantized tensor `qB` to floating B via `torch_convert`, applies a per-element scale factor computed as 2^(Scale[i][j//32] - 127) (Scale supplies exponent offsets in 32-column groups), then computes C = A · B^T and returns the result converted to bfloat16. - - Parameters: - - Returns: - - A: 2D tensor representing the left operand (will be cast to float32 for the matmul). 
- - qB: Quantized representation of B accepted by `torch_convert`. - - Scale: 2D tensor of exponent offsets; Scale[i][g] is applied to columns j where g == j // 32. - - Bias: 2D tensor representing the Bias (will be cast to float32 for the matmul). - - - Returns: - - 2D bfloat16 tensor C containing the matrix product A · B^T. - - No in-place modification is performed on inputs (a local floating copy of B is scaled). - """ - dtypeC = "bfloat16" - B = torch_convert(qB) - for i in range(B.shape[0]): - for j in range(B.shape[1]): - B[i][j] = B[i][j] * (2**(Scale[i][j // 32])) - C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + Bias - C = C.to(torch.__getattribute__(dtypeC)) - return C - - -def main(m=256, n=256, k=256, scale_size=32, fast_dequant=True, with_bias=False, tune=False): - """ - Run and validate the tiled quantized matmul kernel, then benchmark its latency and report TFLOPS. - - Builds a matmul kernel for the given matrix sizes and quantization scale size. If `tune` is True the kernel is obtained via the autotuning path; otherwise a fixed-parameter kernel is used. Validates numerical correctness against the appropriate reference implementation (bit-twiddling reference when `fast_dequant` is True, plain reference otherwise) with rtol/atol=0.01, prints a confirmation, then runs a benchmark (500 warmup iterations) and prints the measured latency (ms) and achieved TFLOPS. - - Parameters: - m (int): Number of rows of A / output rows. Default 256. - n (int): Number of columns of B / output columns. Default 256. - k (int): Reduction dimension. Default 256. - scale_size (int): Size of the per-block scale vector used for dequantization. Default 32. - fast_dequant (bool): If True validate against the twiddling (fast dequant) reference and exercise the fast dequant path; otherwise use the simple dequant reference. Default True. - tune (bool): If True obtain a tuned/autotuned kernel; otherwise use a fixed-parameter kernel. Default False. 
- - Returns: - None - """ - total_flops = 2 * m * n * k - - if tune: - kernel = matmul( - m, - n, - k, - "bfloat16", - "bfloat16", - "float32", - num_bits=4, - scale_size=scale_size, - fast_dequant=fast_dequant, - with_bias=with_bias) - else: - kernel = matmul( - m, - n, - k, - "bfloat16", - "bfloat16", - "float32", - num_bits=4, - scale_size=scale_size, - block_M=256, - block_N=128, - block_K=128, - num_stages=2, - threads=256, - split=1, - fast_dequant=fast_dequant, - with_bias=with_bias) - - profiler = kernel.get_profiler(tilelang.TensorSupplyType.Auto) - - if fast_dequant: - if with_bias: - profiler.assert_allclose(ref_program_twiddling_with_bias, rtol=0.01, atol=0.01) - else: - profiler.assert_allclose(ref_program_twiddling, rtol=0.01, atol=0.01) - else: - if with_bias: - profiler.assert_allclose(ref_program_simple_with_bias, rtol=0.01, atol=0.01) - else: - profiler.assert_allclose(ref_program_simple, rtol=0.01, atol=0.01) - print("All checks pass.") - latency = profiler.do_bench(warmup=500) - print("Tile-lang: {:.2f} ms".format(latency)) - print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) - - -if __name__ == "__main__": - M, N, K = 256, 256, 256 - scale_size = 32 - main(M, N, K, scale_size, fast_dequant=True, with_bias=True) - main(M, N, K, scale_size, fast_dequant=False, with_bias=True) - main(M, N, K, scale_size, fast_dequant=True, with_bias=False) - main(M, N, K, scale_size, fast_dequant=False, with_bias=False) diff --git a/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py b/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py index 727d6d3b6..37826874b 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py +++ b/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py @@ -24,8 +24,9 @@ def matmul( num_bits=4, ): from tilelang.quantize import _tir_packed_to_unsigned_convert + num_elems_per_byte = 8 // num_bits - storage_dtype = "int8" + storage_dtype = T.int8 storage_nbit = int("".join(c for c in storage_dtype if c.isdigit())) storage_type = str("".join(c for c in storage_dtype if not c.isdigit())) A_shape = (M, K) @@ -39,9 +40,9 @@ def matmul( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -58,21 +59,19 @@ def main( T.copy(A[by * block_M, k * block_K], A_shared) T.copy(B[bx * block_N, k * block_K // num_elems_per_byte], B_shared) - for i in T.serial(block_N * block_K // num_elems_per_byte // - (threads * local_size_compressed)): + for i in T.serial(block_N * block_K // num_elems_per_byte // (threads * local_size_compressed)): for v in T.vectorized(0, local_size_compressed): index = i * threads * local_size_compressed + tx * local_size_compressed + v vi = index // (block_K // num_elems_per_byte) vj = index % (block_K // num_elems_per_byte) B_local[v] = B_shared[vi, vj] for v in T.serial(0, local_size): - B_dequantize_local[v] = _tir_packed_to_unsigned_convert( - storage_type, storage_nbit)( - num_bits, - B_local[v // num_elems_per_byte], - v % num_elems_per_byte, - dtype=in_dtype, - ) + B_dequantize_local[v] = _tir_packed_to_unsigned_convert(storage_type, storage_nbit)( + num_bits, + B_local[v // num_elems_per_byte], + v % num_elems_per_byte, + dtype=in_dtype, + ) for v in 
T.vectorized(0, local_size): index = i * threads * local_size + tx * local_size + v vi = index // block_K @@ -121,9 +120,7 @@ def run_gemm( def ref_program(A, qB): import torch - B = ( - torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, - dtype=torch.half).to(torch.half).to(A.device)) + B = torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, dtype=torch.half).to(torch.half).to(A.device) for i in range(B.shape[0]): for j in range(B.shape[1]): B[i][j] = ((qB[i][j // 2] >> (4 * (j % 2))) & 0xF).to(torch.half) @@ -146,25 +143,27 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( ): from tilelang.intrinsics.mma_layout import make_mma_swizzle_layout as make_swizzle_layout from tilelang.intrinsics.mma_macro_generator import ( - TensorCoreIntrinEmitterWithLadderTransform,) + TensorCoreIntrinEmitterWithLadderTransform, + ) from bitblas.gpu.intrin.lop3 import decode_i4_to_f16 + assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" num_bits = 4 num_elems_per_byte = 8 // num_bits - storage_dtype = "int8" + storage_dtype = T.int8 micro_size_x = micro_size_y = micro_size_k = 16 - if out_dtype == "int32": + if out_dtype == T.int32: micro_size_k = 32 # This is a debug config @@ -183,7 +182,7 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( block_M = block_row_warps * warp_row_tiles block_N = block_col_warps * warp_col_tiles - block_K = 32 if in_dtype == "float16" else 64 + block_K = 32 if in_dtype == T.float16 else 64 chunk = block_K // reduce_k is_smooth_a = False @@ -192,8 +191,7 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( pad_factor = 8 A_shape = (M, K) - B_shape = (N // micro_size_y, K // micro_size_k, micro_size_y, - micro_size_k // num_elems_per_byte) + B_shape = (N // micro_size_y, K // micro_size_k, micro_size_y, micro_size_k // num_elems_per_byte) A_shared_shape = (block_M, (block_K + pad_factor) if apply_pad_a else block_K) B_shared_shape = ( block_N // micro_size_y, @@ -228,7 +226,8 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( chunk=chunk, reduce_k=reduce_k, transform_kind_b=transform_b, - num_elems_per_byte=num_elems_per_byte) + num_elems_per_byte=num_elems_per_byte, + ) vec_load_qb = 16 if block_N * (block_K // reduce_k) // num_elems_per_byte // threads < vec_load_qb: @@ -236,14 +235,11 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((M, N), out_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads, - prelude=decode_i4_to_f16) as (bx, by): - + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads, prelude=decode_i4_to_f16) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, storage_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) @@ -255,40 +251,36 @@ def main( thread_binding = T.get_thread_binding(0) rk = T.get_thread_binding(1) - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + } + ) 
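The `ref_program` earlier in example_dequant_gemm_fine_grained.py unpacks two unsigned 4-bit weights from each int8 byte with a shift-and-mask per element. The same unpack can be written in a vectorized form; this sketch uses hypothetical tensor sizes and is only a restatement of that scalar formula:

import torch

# Vectorized equivalent of the per-element unpack in ref_program (hypothetical sizes).
N, QK = 4, 8                                            # packed B is (N, QK), two weights per byte
qB = torch.randint(0, 127, (N, QK), dtype=torch.int8)

j = torch.arange(QK * 2)                                # output column index
B = ((qB[:, j // 2] >> (4 * (j % 2))) & 0xF).to(torch.half)

# Spot-check against the scalar formula used in ref_program.
assert B[0, 0] == float(qB[0, 0] & 0xF)
assert B[0, 1] == float((qB[0, 0].item() >> 4) & 0xF)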
T.use_swizzle(panel_size=10) T.clear(C_local) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, (block_K // reduce_k)): vk = rk * (block_K // reduce_k) + k A_shared[i, vk] = A[by * block_M + i, ko * block_K + vk] # TODO(lei): Layout Inference Pass is not efficient to handle the four dims int8 load - for i in T.serial(block_N * (block_K // reduce_k) // num_elems_per_byte // - (threads * vec_load_qb)): + for i in T.serial(block_N * (block_K // reduce_k) // num_elems_per_byte // (threads * vec_load_qb)): for v in T.vectorized(0, vec_load_qb): t = thread_binding idx = i * threads * vec_load_qb * reduce_k + rk * threads * vec_load_qb + t * vec_load_qb + v vkk = idx % (micro_size_k // num_elems_per_byte) vjj = (idx // (micro_size_k // num_elems_per_byte)) % micro_size_y - vk = (idx // (micro_size_k // num_elems_per_byte) // micro_size_y) % ( - block_K // micro_size_k) - vj = (idx // (micro_size_k // num_elems_per_byte) // micro_size_y // - (block_K // micro_size_k)) % ( - block_N // micro_size_y) - B_shared[vj, vk, vjj, - vkk] = B[bx * (block_N // micro_size_y) + vj, - ko * (block_K // micro_size_k) + vk, vjj, vkk] + vk = (idx // (micro_size_k // num_elems_per_byte) // micro_size_y) % (block_K // micro_size_k) + vj = (idx // (micro_size_k // num_elems_per_byte) // micro_size_y // (block_K // micro_size_k)) % ( + block_N // micro_size_y + ) + B_shared[vj, vk, vjj, vkk] = B[bx * (block_N // micro_size_y) + vj, ko * (block_K // micro_size_k) + vk, vjj, vkk] for ki in T.serial(0, (block_K // (micro_size_k * reduce_k))): - # Load A into fragment mma_emitter.ldmatrix_a( A_local, @@ -307,9 +299,13 @@ def main( for j in T.serial(warp_cols): local_size_b = mma_emitter.local_size_b - T.call_extern('handle', 'decode_i4u_to_f16', - T.address_of(B_local[j * local_size_b // num_elems_per_byte]), - T.address_of(B_dequantize_local[j * local_size_b]), 8) + T.call_extern( + "handle", + "decode_i4u_to_f16", + T.address_of(B_local[j * local_size_b // num_elems_per_byte]), + T.address_of(B_dequantize_local[j * local_size_b]), + 8, + ) mma_emitter.mma(A_local, B_dequantize_local, C_local) @@ -328,7 +324,8 @@ def main( reduced_accum_res[0], rk, dtype="handle", - )) + ) + ) if rk == 0: C_local[n] = reduced_accum_res[0] @@ -340,9 +337,9 @@ def main( for i, j in T.Parallel(block_M, (block_N // reduce_k)): vj = rk * (block_N // reduce_k) + j - C[by * block_M + i, - bx * block_N + vj] = C_shared[i // micro_size_x, vj // micro_size_y, - i % micro_size_x, vj % micro_size_y] + C[by * block_M + i, bx * block_N + vj] = C_shared[ + i // micro_size_x, vj // micro_size_y, i % micro_size_x, vj % micro_size_y + ] return main @@ -357,8 +354,8 @@ def assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correct transform_b, ): import bitblas - matmul = tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( - M, N, K, in_dtype, out_dtype, accum_dtype, transform_b) + + matmul = tl_matmul_with_ladder_weight_only_transform_block_reduce_int4(M, N, K, in_dtype, out_dtype, accum_dtype, transform_b) kernel = tilelang.compile(matmul, out_idx=[2]) src_code = kernel.get_kernel_source() @@ -368,11 +365,10 @@ def assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correct assert src_code is not None num_bits = 4 num_elems_per_byte = 8 // num_bits - storage_dtype = "int8" + storage_dtype = T.int8 A = torch.rand(M, K, device="cuda", dtype=getattr(torch, in_dtype)) - qB = torch.randint( - 0, 127, (N, K // num_elems_per_byte), device="cuda", 
dtype=getattr(torch, storage_dtype)) + qB = torch.randint(0, 127, (N, K // num_elems_per_byte), device="cuda", dtype=getattr(torch, storage_dtype)) C = torch.zeros(M, N, device="cuda", dtype=getattr(torch, accum_dtype)) ladder_permutate_config = bitblas.ops.LadderPermutateConfig( @@ -407,9 +403,7 @@ def assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correct # Ensure that the latency is not None assert latency is not None - B = ( - torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, - dtype=torch.half).to(torch.half).to(A.device)) + B = torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, dtype=torch.half).to(torch.half).to(A.device) for i in range(B.shape[0]): for j in range(B.shape[1]): B[i][j] = ((qB[i][j // 2] >> (4 * (j % 2))) & 0xF).to(torch.half) @@ -423,14 +417,13 @@ def assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correct @tilelang.testing.requires_package("bitblas") def test_run_dequantize_gemm(): - run_gemm(256, 256, 256, "float16", "float16", "float16", 128, 128, 32, num_threads=128) - run_gemm(256, 256, 256, "int8", "int32", "int32", 128, 128, 32, num_threads=128) + run_gemm(256, 256, 256, T.float16, T.float16, T.float16, 128, 128, 32, num_threads=128) + run_gemm(256, 256, 256, T.int8, T.int32, T.int32, 128, 128, 32, num_threads=128) @tilelang.testing.requires_package("bitblas") def test_assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4(): - assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correctness( - 256, 1024, 512, "float16", "float16", "float16", 3) + assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correctness(256, 1024, 512, T.float16, T.float16, T.float16, 3) def main(): diff --git a/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py b/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py index c5588d516..2bdcbb068 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py +++ b/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py @@ -9,30 +9,29 @@ def _tir_u8_to_f4_to_f16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, dtype: str): assert nbit == 4 - assert dtype == "float16" - assert val.dtype == "uint8" + assert dtype == T.float16 + assert val.dtype == T.uint8 # e_f4 == 0 -> e_f16 = 0 # e_f4 != 0 -> e_f16 = e_f4 + ExponentialBias(f16, f4) = e_f4 + (2^4 - 2^1) = e_f4 + 14 # s1e2m1 - mask = tir.const((1 << nbit) - 1, "uint16") - f4 = (val >> (pos.astype("uint16") * tir.const(nbit, "uint16"))) & mask - s = f4 >> tir.const(3, "uint16") - e_f4 = (f4 & tir.const(6, "uint16")) >> tir.const(1, "uint16") - e_f16 = e_f4 + tir.const(14, "uint16") - m_f4 = f4 & tir.const(1, "uint16") + mask = tir.const((1 << nbit) - 1, T.uint16) + f4 = (val >> (pos.astype(T.uint16) * tir.const(nbit, T.uint16))) & mask + s = f4 >> tir.const(3, T.uint16) + e_f4 = (f4 & tir.const(6, T.uint16)) >> tir.const(1, T.uint16) + e_f16 = e_f4 + tir.const(14, T.uint16) + m_f4 = f4 & tir.const(1, T.uint16) m_f16 = m_f4 - val_f16 = tir.reinterpret("float16", - ((e_f16 | (s << tir.const(5, "uint16"))) << tir.const(10, "uint16") - | m_f16 << tir.const(9, "uint16")).astype("uint16")) - # return tir.Select(e_f4 == tir.const(0, "uint32"), tir.const(0, "float16"), val_f16) + val_f16 = tir.reinterpret( + T.float16, ((e_f16 | (s << tir.const(5, T.uint16))) << tir.const(10, T.uint16) | m_f16 << tir.const(9, T.uint16)).astype(T.uint16) + ) + # return tir.Select(e_f4 == tir.const(0, "uint32"), tir.const(0, T.float16), val_f16) return val_f16 def torch_convert(tensor): - def print_bit(name, 
val): val_cpu = val.cpu().item() - binary_repr = f'{val_cpu:032b}' + binary_repr = f"{val_cpu:032b}" print(name, binary_repr) def _convert(val, pos): @@ -61,15 +60,15 @@ def _convert(val, pos): @tilelang.jit(out_idx=[1]) def test_convert(N, K, block_N, block_K, in_dtype, num_bits=4, threads=128): num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 B_shape = (N, K // num_elems_per_byte) B_shared_shape = (block_N, block_K // num_elems_per_byte) B_dequantize_shared_shape = (block_N, block_K) @T.prim_func def main( - B: T.Tensor(B_shape, storage_dtype), - C: T.Tensor((N, K), in_dtype), + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((N, K), in_dtype), ): with T.Kernel(T.ceildiv(N, block_N), threads=threads) as (bx): B_shared = T.alloc_shared(B_shared_shape, storage_dtype) @@ -99,7 +98,7 @@ def test_fp4_fp16_convert_close(): K, block_N, block_K, - "float16", + T.float16, ) B = torch.randint(0, 16, (N, K // 2), dtype=torch.uint8, device="cuda").to(torch.uint8) @@ -118,23 +117,15 @@ def get_configs(): splits = [1] _configs = list(itertools.product(block_M, block_N, block_K, num_stages, threads, splits)) - configs = [{ - 'block_M': c[0], - 'block_N': c[1], - 'block_K': c[2], - 'num_stages': c[3], - 'threads': c[4], - 'split': c[5] - } for c in _configs] + configs = [{"block_M": c[0], "block_N": c[1], "block_K": c[2], "num_stages": c[3], "threads": c[4], "split": c[5]} for c in _configs] return configs def matmul(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune=False): - @tilelang.jit(out_idx=[2]) def kernel_func(block_M, block_N, block_K, num_stages, threads, split=1): num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 A_shape = (M, K) B_shape = (N, K // num_elems_per_byte) A_shared_shape = (block_M, block_K) @@ -145,29 +136,24 @@ def kernel_func(block_M, block_N, block_K, num_stages, threads, split=1): @T.prim_func def main_split( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - Ct: T.Tensor((N, M), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Ct: T.Tensor((N, M), out_dtype), ): - SplitC = T.alloc_buffer([ - split, (N + block_N - 1) // block_N * block_N, - (M + block_M - 1) // block_M * block_M - ], out_dtype) - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), split, - threads=threads) as (bx, by, bz): + SplitC = T.alloc_buffer([split, (N + block_N - 1) // block_N * block_N, (M + block_M - 1) // block_M * block_M], out_dtype) + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), split, threads=threads) as (bx, by, bz): A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, storage_dtype) B_local = T.alloc_fragment(B_shared_shape, storage_dtype) B_dequantize_local = T.alloc_fragment(B_dequantize_shared_shape, in_dtype) B_dequantize_prev_local = T.alloc_fragment(B_dequantize_shared_shape, in_dtype) Ct_local = T.alloc_fragment((block_N, block_M), accum_dtype) - Ct_shared = T.alloc_shared((block_N, block_M), out_dtype) - T.annotate_layout({ - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared), - }) + T.annotate_layout( + { + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + } + ) T.clear(Ct_local) for k in T.Pipelined(K // (block_K * split), num_stages=num_stages): @@ -183,8 +169,7 @@ def main_split( ) T.copy(B_dequantize_local, B_dequantize_prev_local) T.gemm(B_dequantize_prev_local, A_shared, Ct_local, 
transpose_B=True) - T.copy(Ct_local, SplitC[bz, bx * block_N:(bx + 1) * block_N, - by * block_M:(by + 1) * block_M]) + T.copy(Ct_local, SplitC[bz, bx * block_N : (bx + 1) * block_N, by * block_M : (by + 1) * block_M]) with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M)) as (bx, by): acc = T.alloc_fragment((block_N, block_M), out_dtype) T.clear(acc) @@ -195,12 +180,11 @@ def main_split( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - Ct: T.Tensor((N, M), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Ct: T.Tensor((N, M), out_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, storage_dtype) B_local = T.alloc_fragment(B_shared_shape, storage_dtype) @@ -209,10 +193,11 @@ def main( Ct_local = T.alloc_fragment((block_N, block_M), accum_dtype) Ct_shared = T.alloc_shared((block_N, block_M), out_dtype) - T.annotate_layout({ - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared), - }) + T.annotate_layout( + { + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + } + ) T.clear(Ct_local) for k in T.Pipelined(K // block_K, num_stages=num_stages): @@ -229,8 +214,7 @@ def main( T.copy(B_dequantize_local, B_dequantize_prev_local) T.gemm(B_dequantize_prev_local, A_shared, Ct_local, transpose_B=True) T.copy(Ct_local, Ct_shared) - T.copy(Ct_shared, Ct[bx * block_N:(bx + 1) * block_N, - by * block_M:(by + 1) * block_M]) + T.copy(Ct_shared, Ct[bx * block_N : (bx + 1) * block_N, by * block_M : (by + 1) * block_M]) if split == 1: return main @@ -241,12 +225,7 @@ def main( @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit(out_idx=[2]) - def kernel(block_M=None, - block_N=None, - block_K=None, - num_stages=None, - threads=None, - split=None): + def kernel(block_M=None, block_N=None, block_K=None, num_stages=None, threads=None, split=None): return kernel_func(block_M, block_N, block_K, num_stages, threads, split).prim_func return kernel() @@ -259,7 +238,7 @@ def kernel(block_M, block_N, block_K, num_stages, threads, split=1): def ref_program(A, qB): - dtypeC = "float16" + dtypeC = T.float16 B = torch_convert(qB) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) @@ -269,10 +248,10 @@ def ref_program(A, qB): def main(m=256, n=256, k=256, tune=False): total_flops = 2 * m * n * k - if (not tune): - kernel = matmul( - m, n, k, "float16", "float16", "float32", num_bits=4, tune=tune)( - block_M=128, block_N=128, block_K=128, num_stages=2, threads=256, split=1) + if not tune: + kernel = matmul(m, n, k, T.float16, T.float16, T.float32, num_bits=4, tune=tune)( + block_M=128, block_N=128, block_K=128, num_stages=2, threads=256, split=1 + ) profiler = kernel.get_profiler(tilelang.TensorSupplyType.Integer) profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01) print("All checks pass.") @@ -283,7 +262,7 @@ def main(m=256, n=256, k=256, tune=False): print("Tile-lang: {:.2f} ms".format(latency)) print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) else: - best_result = matmul(m, n, k, "float16", "float16", "float32", num_bits=4, tune=tune) + best_result = matmul(m, n, k, T.float16, T.float16, T.float32, num_bits=4, tune=tune) best_latency = 
best_result.latency best_config = best_result.config print(f"Best latency: {best_latency}") @@ -291,12 +270,20 @@ def main(m=256, n=256, k=256, tune=False): print(f"Best config: {best_config}") +def run_regression_perf(m=4096, n=4096, k=4096): + kernel = matmul(m, n, k, "float16", "float16", "float32", num_bits=4, tune=False)( + block_M=128, block_N=128, block_K=128, num_stages=2, threads=256, split=1 + ) + profiler = kernel.get_profiler(tilelang.TensorSupplyType.Integer) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--m', type=int, default=256, help='M') - parser.add_argument('--n', type=int, default=256, help='N') - parser.add_argument('--k', type=int, default=256, help='K') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--m", type=int, default=256, help="M") + parser.add_argument("--n", type=int, default=256, help="N") + parser.add_argument("--k", type=int, default=256, help="K") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() M, N, K = args.m, args.n, args.k main(M, N, K, args.tune) diff --git a/examples/dequantize_gemm/example_dequant_gemm_w4a8.py b/examples/dequantize_gemm/example_dequant_gemm_w4a8.py index 52ee8216f..b1f8b1132 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_w4a8.py +++ b/examples/dequantize_gemm/example_dequant_gemm_w4a8.py @@ -9,15 +9,15 @@ def _tir_u8_to_i4_to_i8(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, dtype: str): assert nbit == 4 - assert dtype == "int8" - assert val.dtype == "uint8" + assert dtype == T.int8 + assert val.dtype == T.uint8 - mask = tir.const((1 << nbit) - 1, "uint8") + mask = tir.const((1 << nbit) - 1, T.uint8) - i4 = (val >> (pos.astype("uint8") * tir.const(nbit, "uint8"))) & mask + i4 = (val >> (pos.astype(T.uint8) * tir.const(nbit, T.uint8))) & mask - i8_shifted = tir.reinterpret("int8", i4 << tir.const(4, "uint8")) - i8 = i8_shifted >> tir.const(4, "int8") + i8_shifted = tir.reinterpret(T.int8, i4 << tir.const(4, T.uint8)) + i8 = i8_shifted >> tir.const(4, T.int8) return i8 @@ -35,15 +35,15 @@ def get_configs(): @tilelang.jit(out_idx=[1]) def _convert_test(N, K, block_N, block_K, in_dtype, num_bits=4, threads=128): num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 B_shape = (N, K // num_elems_per_byte) B_shared_shape = (block_N, block_K // num_elems_per_byte) B_dequantize_shared_shape = (block_N, block_K) @T.prim_func def main( - B: T.Tensor(B_shape, storage_dtype), - C: T.Tensor((N, K), in_dtype), + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((N, K), in_dtype), ): with T.Kernel(T.ceildiv(N, block_N), threads=threads) as (bx): B_shared = T.alloc_shared(B_shared_shape, storage_dtype) @@ -66,13 +66,12 @@ def main( def torch_convert(tensor): - def _convert(val, pos): assert val.dtype == torch.uint8 val = val.view(torch.int8) mask = (1 << 4) - 1 - i4_shifted = ((val >> (pos * 4)) & mask) - i4 = ((i4_shifted << 4) >> 4) + i4_shifted = (val >> (pos * 4)) & mask + i4 = (i4_shifted << 4) >> 4 return i4.view(torch.int8) @@ -86,7 +85,7 @@ def _convert(val, pos): def ref_program(A, qB): - dtypeC = "int32" + dtypeC = T.int32 B = torch_convert(qB) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) @@ -94,11 +93,10 @@ def ref_program(A, qB): def matmul_int8xint4(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune=False): - @tilelang.jit(out_idx=[2]) def 
kernel_func(block_M, block_N, block_K, num_stages, threads): num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 A_shape = (M, K) B_shape = (N, K // num_elems_per_byte) A_shared_shape = (block_M, block_K) @@ -109,12 +107,11 @@ def kernel_func(block_M, block_N, block_K, num_stages, threads): @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - Ct: T.Tensor((N, M), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Ct: T.Tensor((N, M), out_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, storage_dtype) B_local = T.alloc_fragment(B_shared_shape, storage_dtype) @@ -123,10 +120,11 @@ def main( Ct_local = T.alloc_fragment((block_N, block_M), accum_dtype) Ct_shared = T.alloc_shared((block_N, block_M), out_dtype) - T.annotate_layout({ - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared), - }) + T.annotate_layout( + { + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + } + ) T.clear(Ct_local) for k in T.Pipelined(K // block_K, num_stages=num_stages): @@ -143,8 +141,7 @@ def main( T.copy(B_dequantize_local, B_dequantize_prev_local) T.gemm(B_dequantize_prev_local, A_shared, Ct_local, transpose_B=True) T.copy(Ct_local, Ct_shared) - T.copy(Ct_shared, Ct[bx * block_N:(bx + 1) * block_N, - by * block_M:(by + 1) * block_M]) + T.copy(Ct_shared, Ct[bx * block_N : (bx + 1) * block_N, by * block_M : (by + 1) * block_M]) return main @@ -167,10 +164,10 @@ def kernel(block_M, block_N, block_K, num_stages, threads): def main(m=128, n=256, k=256, tune=False): total_flops = 2 * m * n * k - if (not tune): - kernel = matmul_int8xint4( - m, n, k, "int8", "int32", "int32", num_bits=4, tune=tune)( - block_M=32, block_N=32, block_K=128, num_stages=1, threads=128) + if not tune: + kernel = matmul_int8xint4(m, n, k, T.int8, T.int32, T.int32, num_bits=4, tune=tune)( + block_M=32, block_N=32, block_K=128, num_stages=1, threads=128 + ) profiler = kernel.get_profiler() profiler.assert_allclose(ref_program, rtol=1e-2, atol=1e-2) print("All checks pass.") @@ -179,7 +176,7 @@ def main(m=128, n=256, k=256, tune=False): print(f"Tilelang: {latency} ms") else: - best_result = matmul_int8xint4(m, n, k, "int8", "int32", "int32", num_bits=4, tune=tune) + best_result = matmul_int8xint4(m, n, k, T.int8, T.int32, T.int32, num_bits=4, tune=tune) best_latency = best_result.latency best_config = best_result.config print(f"Bset latency: {best_latency}") @@ -187,6 +184,14 @@ def main(m=128, n=256, k=256, tune=False): print(f"Best tflops: {total_flops / best_latency * 1e-9}") +def run_regression_perf(m=4096, n=4096, k=4096): + kernel = matmul_int8xint4(m, n, k, "int8", "int32", "int32", num_bits=4, tune=False)( + block_M=32, block_N=32, block_K=128, num_stages=1, threads=128 + ) + profiler = kernel.get_profiler() + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--m", type=int, default=512, help="Matrix dimension M") diff --git a/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py b/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py index d3e90ec93..652ce3479 100644 --- 
a/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py +++ b/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py @@ -4,7 +4,8 @@ import torch from tilelang import DataType from tilelang.quantize import ( - _tir_packed_int_to_int_convert,) + _tir_packed_int_to_int_convert, +) @tilelang.jit @@ -16,7 +17,7 @@ def dequantize_gemv( out_dtype: str, accum_dtype: str, num_bits: int = 4, - storage_dtype: str = "int8", + storage_dtype: T.dtype = T.int8, source_format: str = "uint", n_partition: int = 4, reduce_thread: int = 32, @@ -26,11 +27,10 @@ def dequantize_gemv( group_size: int = -1, with_scaling: bool = False, ) -> Callable[..., Any]: - assert n_partition is not None, "n_partition must be provided" assert reduce_thread is not None, ( - "reduce_thread must be provided currently, as related bitblas.gpu.gemv.GEMV" - "sch_outer_reduction_with_config is not implemented") + "reduce_thread must be provided currently, as related bitblas.gpu.gemv.GEMVsch_outer_reduction_with_config is not implemented" + ) assert trans_A is False, "Dequantize only implement for trans_A=False currently" assert trans_B is True, "Dequantize only implement for trans_B=TRue currently" @@ -51,7 +51,7 @@ def dequantize_gemv( C_shape = (M, N) dp4a_size = 4 - use_dp4a = in_dtype == "int8" and accum_dtype == "int32" + use_dp4a = in_dtype == T.int8 and accum_dtype == T.int32 import_source: Optional[str] = None func_name: str = "" @@ -81,12 +81,12 @@ def main( C: T.Tensor[C_shape, out_dtype], ): with T.Kernel( - T.ceildiv(N, n_partition), - M, - threads=(reduce_thread, n_partition), + T.ceildiv(N, n_partition), + M, + threads=(reduce_thread, n_partition), ) as ( - bx, - by, + bx, + by, ): A_local = T.alloc_local((micro_size_k,), in_dtype) B_quant_local = T.alloc_local([micro_size_k_compressed], storage_dtype) @@ -107,8 +107,7 @@ def main( for v in T.vectorized(micro_size_k_compressed): B_quant_local[v] = B[ bx * n_partition + ni, - ko * (reduce_thread * micro_size_k_compressed) + - kr * micro_size_k_compressed + v, + ko * (reduce_thread * micro_size_k_compressed) + kr * micro_size_k_compressed + v, ] if fast_decoding: @@ -120,10 +119,9 @@ def main( ) else: for ki in T.serial(micro_size_k): - B_dequantize_local[ki] = _tir_packed_int_to_int_convert( - storage_type, - storage_nbit)(num_bits, B_quant_local[ki // num_elems_per_byte], - ki % num_elems_per_byte, in_dtype) + B_dequantize_local[ki] = _tir_packed_int_to_int_convert(storage_type, storage_nbit)( + num_bits, B_quant_local[ki // num_elems_per_byte], ki % num_elems_per_byte, in_dtype + ) if use_dp4a: for ki in T.serial(micro_size_k // dp4a_size): @@ -137,9 +135,9 @@ def main( accum_res[0] += A_local[ki] * B_dequantize_local[ki] with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), - "reduce_scope", - T.reinterpret(T.uint64(0), dtype="handle"), + T.comm_reducer(lambda x, y: x + y, [T.cast(0, accum_dtype)]), + "reduce_scope", + T.reinterpret(T.uint64(0), dtype="handle"), ): T.evaluate( T.tvm_thread_allreduce( @@ -149,7 +147,8 @@ def main( reduced_accum_res[0], kr, dtype="handle", - )) + ) + ) if kr == 0: C[by, bx * n_partition + ni] = reduced_accum_res[0] @@ -160,11 +159,11 @@ def main() -> None: M = 1 N = 1024 K = 1024 - in_dtype = "float16" - out_dtype = "float16" - accum_dtype = "float16" + in_dtype = T.float16 + out_dtype = T.float16 + accum_dtype = T.float16 num_bits = 4 - storage_dtype = "int8" + storage_dtype = T.int8 source_format = "uint" n_partition = 4 reduce_thread = 32 @@ -174,26 +173,39 @@ def main() -> None: group_size = -1 
with_scaling = False - kernel = dequantize_gemv(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits, storage_dtype, - source_format, n_partition, reduce_thread, fast_decoding, trans_A, - trans_B, group_size, with_scaling) + kernel = dequantize_gemv( + M, + N, + K, + in_dtype, + out_dtype, + accum_dtype, + num_bits, + storage_dtype, + source_format, + n_partition, + reduce_thread, + fast_decoding, + trans_A, + trans_B, + group_size, + with_scaling, + ) storage_nbit = int("".join(c for c in storage_dtype if c.isdigit())) num_elems_per_byte = storage_nbit // num_bits A = torch.rand(M, K, dtype=getattr(torch, in_dtype)).cuda() - qB = torch.randint( - 0, 127, (N, K // num_elems_per_byte), dtype=getattr(torch, storage_dtype)).cuda() + qB = torch.randint(0, 127, (N, K // num_elems_per_byte), dtype=getattr(torch, storage_dtype)).cuda() C = torch.zeros(M, N, dtype=getattr(torch, accum_dtype)).cuda() if fast_decoding: from tilelang.quantize.utils import interleave_weight + qB = interleave_weight(qB, num_bits, in_dtype) kernel(A, qB, C) # int4 reference - B = ( - torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, - dtype=torch.half).to(torch.half).to(A.device)) + B = torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, dtype=torch.half).to(torch.half).to(A.device) for j in range(B.shape[1]): B[:, j] = ((qB[:, j // 2] >> (4 * (j % 2))) & 0xF).to(torch.half) @@ -205,5 +217,62 @@ def main() -> None: torch.testing.assert_close(C, ref_c, atol=1e3, rtol=1e-1) +def run_regression_perf(): + M = 1 + N = 8192 + K = 8192 + in_dtype = "float16" + out_dtype = "float16" + accum_dtype = "float16" + num_bits = 4 + storage_dtype = "int8" + source_format = "uint" + n_partition = 4 + reduce_thread = 32 + fast_decoding = True + trans_A = False + trans_B = True + group_size = -1 + with_scaling = False + + kernel = dequantize_gemv( + M, + N, + K, + in_dtype, + out_dtype, + accum_dtype, + num_bits, + storage_dtype, + source_format, + n_partition, + reduce_thread, + fast_decoding, + trans_A, + trans_B, + group_size, + with_scaling, + ) + + storage_nbit = int("".join(c for c in storage_dtype if c.isdigit())) + num_elems_per_byte = storage_nbit // num_bits + A = torch.rand(M, K, dtype=getattr(torch, in_dtype)).cuda() + qB = torch.randint(0, 127, (N, K // num_elems_per_byte), dtype=getattr(torch, storage_dtype)).cuda() + C = torch.zeros(M, N, dtype=getattr(torch, accum_dtype)).cuda() + + if fast_decoding: + from tilelang.quantize.utils import interleave_weight + + qB = interleave_weight(qB, num_bits, in_dtype) + kernel(A, qB, C) + + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(A, qB, C) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/dequantize_gemm/example_dequant_groupedgemm_bf16_mxfp4_hopper.py b/examples/dequantize_gemm/example_dequant_groupedgemm_bf16_mxfp4_hopper.py index c4cf5fb50..6aad32bdb 100644 --- a/examples/dequantize_gemm/example_dequant_groupedgemm_bf16_mxfp4_hopper.py +++ b/examples/dequantize_gemm/example_dequant_groupedgemm_bf16_mxfp4_hopper.py @@ -25,6 +25,7 @@ def get_configs(): List[dict]: A list of configuration dictionaries covering all combinations. 
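        For example, one generated entry is {'block_M': 128, 'block_N': 64, 'block_K': 128, 'num_stages': 1, 'threads': 128, 'split': 1}.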
""" import itertools + iter_params = dict( block_M=[128], block_N=[64, 128, 256], @@ -33,33 +34,33 @@ def get_configs(): threads=[128, 256, 512], split=[1], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] @tilelang.autotune(configs=get_configs()) @tilelang.jit(out_idx=[-1]) -def matmul(M, - N, - K, - topk, - E, - padding_M, - in_dtype, - out_dtype, - accum_dtype, - source_format='uint', - num_bits=4, - scale_size=32, - fast_dequant=True, - with_bias=False, - block_M=128, - block_N=256, - block_K=128, - num_stages=2, - threads=256, - split=1): +def matmul( + M, + N, + K, + topk, + E, + padding_M, + in_dtype, + out_dtype, + accum_dtype, + source_format=T.uint32, + num_bits=4, + scale_size=32, + fast_dequant=True, + with_bias=False, + block_M=128, + block_N=256, + block_K=128, + num_stages=2, + threads=256, + split=1, +): """ Construct and return a grouped (Mixture-of-Experts) matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized, expert-grouped B (shape ExNxQK) and writes an output of shape (M, topk, N) in out_dtype. @@ -82,8 +83,8 @@ def matmul(M, topk (int): number of experts selected per token. E (int): number of experts. padding_M (int): padded number of tokens after grouping and block alignment. - in_dtype (str): element type of A (e.g., "bfloat16"). - out_dtype (str): output tensor element type (e.g., "bfloat16"). + in_dtype (str): element type of A (e.g., T.bfloat16). + out_dtype (str): output tensor element type (e.g., T.bfloat16). accum_dtype (str): accumulation type used for the inner GEMM. source_format (str, optional): format string passed to intrinsic selector (default "uint"). num_bits (int, optional): number of bits per quantized element in B (default 4). @@ -110,16 +111,17 @@ def matmul(M, """ num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 QK = K // num_elems_per_byte Block_QK = block_K // num_elems_per_byte A_shared_shape = (block_M, block_K) B_shared_shape = (block_N, Block_QK) - Bias_shared_shape = (block_N) + Bias_shared_shape = block_N B_dequantize_shared_shape = (block_N, block_K) assert K % (block_K * split) == 0 from tilelang.quantize import get_mxfp_intrin_group + # fast_dequant_bf16_fp4_twiddling mxfp_intrin_info = get_mxfp_intrin_group( out_dtype=in_dtype, @@ -135,7 +137,7 @@ def matmul(M, import_source = import_source # the dequant part is the same as in dequant_gemm - def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Return a TileLang macro that performs fast dequantization of twiddled FP4-packed data into BF16. The returned macro has signature (B_shared, B_dequantize_shared, Scale, k) and: @@ -145,12 +147,12 @@ def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): - Writes the scaled BF16 results into B_dequantize_shared. Notes: - - This factory only supports in_dtype="fp4" and out_dtype="bfloat16". + - This factory only supports in_dtype="fp4" and out_dtype=T.bfloat16. - The macro depends on several names from the enclosing scope (e.g., import_source, func_name, DataType, num_elems_per_byte, storage_dtype, block_N, block_K, threads, scale_size); those must be defined and consistent with the kernel that will use the macro. 
- The macro issues a T.import_source and T.call_extern to invoke the external intrinsic; ensure the external implementation matching `func_name` is available at compilation/runtime. """ assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] # Some variables for dequantization in each thread MAX_TRANSACTION_SIZE_BITS = 128 @@ -221,19 +223,16 @@ def fast_dequant_bf16_fp4_twiddling(B_shared, B_dequantize_shared, Scale_shared, for v in T.vectorized(0, local_size): index = i * threads * local_size + tx * local_size + v - B_dequantize_shared[index // block_K, - index % block_K] = B_dequantize_local_thread[v] + B_dequantize_shared[index // block_K, index % block_K] = B_dequantize_local_thread[v] return fast_dequant_bf16_fp4_twiddling - def get_simple_dequant_func(in_dtype="fp4", out_dtype="bfloat16"): - + def get_simple_dequant_func(in_dtype="fp4", out_dtype=T.bfloat16): assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] @T.macro def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale_shared, k): - B_local = T.alloc_fragment(B_shared_shape, storage_dtype) B_dequantize_local = T.alloc_fragment(B_dequantize_shared_shape, out_dtype) @@ -244,8 +243,8 @@ def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale_shared, k): B_local[i, j // num_elems_per_byte], j % num_elems_per_byte, Scale_shared[ - i, k * block_K // scale_size + j // - scale_size], # Scale is the exponential part, within the representation of uint8 + i, k * block_K // scale_size + j // scale_size + ], # Scale is the exponential part, within the representation of uint8 dtype=out_dtype, ) * T.shift_left(1, (Scale_shared[i, k * block_K // scale_size + j // scale_size])) T.copy(B_dequantize_local, B_dequantize_shared) @@ -254,19 +253,17 @@ def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale_shared, k): @T.prim_func def main( - A: T.Tensor((M, K), in_dtype), - B: T.Tensor((E, N, QK), storage_dtype), - Scale: T.Tensor((E, N, K // scale_size), storage_dtype), - Bias: T.Tensor((E, N), out_dtype), - # Add fusedmoe tensors - topk_weights: T.Tensor((M * topk), out_dtype), - sorted_token_ids: T.Tensor((padding_M), "int32"), - expert_ids: T.Tensor((padding_M // block_M), "int32"), - C: T.Tensor((M, topk, N), out_dtype), + A: T.Tensor((M, K), in_dtype), + B: T.Tensor((E, N, QK), storage_dtype), + Scale: T.Tensor((E, N, K // scale_size), storage_dtype), + Bias: T.Tensor((E, N), out_dtype), + # Add fusedmoe tensors + topk_weights: T.Tensor((M * topk), out_dtype), + sorted_token_ids: T.Tensor((padding_M), T.int32), + expert_ids: T.Tensor((padding_M // block_M), T.int32), + C: T.Tensor((M, topk, N), out_dtype), ): - - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(padding_M, block_M), threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(padding_M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, storage_dtype) B_dequantize_shared = T.alloc_shared(B_dequantize_shared_shape, in_dtype) @@ -274,23 +271,23 @@ def main( C_local = T.alloc_fragment((block_M, block_N), accum_dtype) C_shared = T.alloc_shared((block_M, block_N), out_dtype) topk_weights_shared = T.alloc_shared((block_M), out_dtype) - sorted_token_ids_shared = T.alloc_shared((block_M), "int32") - expert_id = T.alloc_local((1), "int32") # the expert id for the current block + sorted_token_ids_shared = T.alloc_shared((block_M), T.int32) + expert_id = T.alloc_local((1), 
T.int32) # the expert id for the current block # To use 1D TMA, the last dim of Scale_shared must have stride=1 # May use much more shared memory than necessary Scale_shared = T.alloc_shared((block_N, K // scale_size), storage_dtype) - T.annotate_layout({ - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - C_shared: tilelang.layout.make_swizzled_layout(C_shared), - }) + T.annotate_layout( + { + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + } + ) T.use_swizzle(10) if threads == 512: T.disable_warp_group_reg_alloc() - T.copy(sorted_token_ids[by * block_M:(by + 1) * block_M], sorted_token_ids_shared) + T.copy(sorted_token_ids[by * block_M : (by + 1) * block_M], sorted_token_ids_shared) expert_id[0] = expert_ids[by] # Get the topk weights of each token in the current block @@ -300,11 +297,11 @@ def main( # Get bias and scale based on the expert id if with_bias: - T.copy(Bias[expert_id[0], bx * block_N:(bx + 1) * block_N], Bias_shared) + T.copy(Bias[expert_id[0], bx * block_N : (bx + 1) * block_N], Bias_shared) else: T.clear(Bias_shared) - T.copy(Scale[expert_id[0], bx * block_N:(bx + 1) * block_N, :], Scale_shared) + T.copy(Scale[expert_id[0], bx * block_N : (bx + 1) * block_N, :], Scale_shared) for i, j in T.Parallel(block_M, block_N): C_local[i, j] = Bias_shared[j] @@ -317,14 +314,13 @@ def main( base = copy_i * threads * 16 + tx * 16 if sorted_token_ids_shared[base // block_K] != -1: for copy_j in T.vectorized(16): - A_shared[base // block_K, base % block_K + - copy_j] = A[sorted_token_ids_shared[base // block_K] // topk, - k * block_K + base % block_K + copy_j] + A_shared[base // block_K, base % block_K + copy_j] = A[ + sorted_token_ids_shared[base // block_K] // topk, k * block_K + base % block_K + copy_j + ] T.copy(B[expert_id[0], bx * block_N, k * block_K // num_elems_per_byte], B_shared) if fast_dequant: - get_fast_dequant_twiddling_func()(B_shared, B_dequantize_shared, Scale_shared, - k) + get_fast_dequant_twiddling_func()(B_shared, B_dequantize_shared, Scale_shared, k) else: get_simple_dequant_func()(B_shared, B_dequantize_shared, Scale_shared, k) @@ -338,16 +334,17 @@ def main( base = copy_i * threads * 16 + tx * 16 if sorted_token_ids_shared[base // block_N] != -1: for copy_j in T.vectorized(16): - C[sorted_token_ids_shared[base // block_N] // topk, - sorted_token_ids_shared[base // block_N] % topk, bx * block_N + - base % block_N + copy_j] = C_shared[base // block_N, - base % block_N + copy_j] + C[ + sorted_token_ids_shared[base // block_N] // topk, + sorted_token_ids_shared[base // block_N] % topk, + bx * block_N + base % block_N + copy_j, + ] = C_shared[base // block_N, base % block_N + copy_j] return main def ref_moe(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, block_M=256): - dtypeC = "bfloat16" + dtypeC = T.bfloat16 M, K = A.shape E, N, QK = qB.shape topk = topk_weights.shape[0] // M @@ -355,7 +352,7 @@ def ref_moe(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, bloc assert scale_size == 32 # MXFP4 # Initialize output tensor - C = torch.ones((M, topk, N), dtype=getattr(torch, dtypeC), device='cuda') + C = torch.ones((M, topk, N), dtype=getattr(torch, dtypeC), device="cuda") # Iterate over sorted_token_ids for idx in range(len(sorted_token_ids)): # padding_M @@ -370,14 +367,11 @@ def ref_moe(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, bloc # Dequantize the expert weights B = torch_convert_bit_twiddling(qB[expert_id]) # shape: (N, K) - B *= 
2**( - Scale[expert_id][:, (torch.arange(B.shape[1], device=B.device) // scale_size)].to( - torch.bfloat16)) + B *= 2 ** (Scale[expert_id][:, (torch.arange(B.shape[1], device=B.device) // scale_size)].to(torch.bfloat16)) # Compute the output for this token-expert pair # token_embedding @ B.T + bias - output = torch.matmul(token_embedding.to(torch.bfloat16), B.T.to( - torch.bfloat16)) + Bias[expert_id] + output = torch.matmul(token_embedding.to(torch.bfloat16), B.T.to(torch.bfloat16)) + Bias[expert_id] output = output.to(torch.__getattribute__(dtypeC)) # Apply the topk weight @@ -391,14 +385,12 @@ def ref_moe(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, bloc def get_data(m, n, k, qk, scale_size, topk, E, block_M): - A = torch.empty(m, k, dtype=torch.bfloat16, device='cuda').uniform_(-1, 1) - qB = torch.randint( - 0, 256, (E, n, qk), dtype=torch.uint8, - device='cuda') # Quantized weight tensor for E experts. - Scale = torch.randint(0, 8, (E, n, k // scale_size), dtype=torch.uint8, device='cuda') - Bias = torch.empty(E, n, dtype=torch.bfloat16, device='cuda').uniform_(-1, 1) - - weights = torch.empty(m, E, dtype=torch.bfloat16, device='cuda').uniform_(-1, 1) + A = torch.empty(m, k, dtype=torch.bfloat16, device="cuda").uniform_(-1, 1) + qB = torch.randint(0, 256, (E, n, qk), dtype=torch.uint8, device="cuda") # Quantized weight tensor for E experts. + Scale = torch.randint(0, 8, (E, n, k // scale_size), dtype=torch.uint8, device="cuda") + Bias = torch.empty(E, n, dtype=torch.bfloat16, device="cuda").uniform_(-1, 1) + + weights = torch.empty(m, E, dtype=torch.bfloat16, device="cuda").uniform_(-1, 1) # topk_weights: Router weights for the top-k experts for each token. # Shape: (m, topk) # tokens_experts: A flattened tensor of expert assignments for each token. @@ -420,10 +412,7 @@ def get_data(m, n, k, qk, scale_size, topk, E, block_M): pad_len = ((cnt + block_M - 1) // block_M) * block_M - cnt if pad_len > 0: # -1 for padding (`M` instead in vLLM moe_align_block_size()) - group_token_ids = torch.cat([ - group_token_ids, - torch.full((pad_len,), -1, dtype=group_token_ids.dtype, device='cuda') - ]) + group_token_ids = torch.cat([group_token_ids, torch.full((pad_len,), -1, dtype=group_token_ids.dtype, device="cuda")]) padded_token_ids.append(group_token_ids) expert_ids.extend([eid] * ((cnt + block_M - 1) // block_M)) start = end @@ -431,21 +420,13 @@ def get_data(m, n, k, qk, scale_size, topk, E, block_M): # sorted_token_ids: The final flattened and padded tensor of token indices. sorted_token_ids = torch.cat(padded_token_ids, dim=0).to(torch.int32) # (padding_M,) # expert_ids: The final tensor of expert IDs corresponding to `sorted_token_ids`. 
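    # Worked example (illustrative): with block_M = 4, an expert that owns 6 entries of the
    # flattened (token, expert) assignment contributes [t0, t1, t2, t3, t4, t5, -1, -1] to
    # sorted_token_ids (padded up to a multiple of block_M), and its id appears twice in
    # expert_ids, once per block_M-sized block.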
- expert_ids = torch.tensor(expert_ids, dtype=torch.int32, device='cuda') # (padding_M,) + expert_ids = torch.tensor(expert_ids, dtype=torch.int32, device="cuda") # (padding_M,) padding_M = sorted_token_ids.shape[0] # padding_M: token number after padding return A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, padding_M -def main(m=256, - n=256, - k=256, - scale_size=32, - topk=4, - E=32, - fast_dequant=True, - with_bias=False, - tune=False): +def main(m=256, n=256, k=256, scale_size=32, topk=4, E=32, fast_dequant=True, with_bias=False, tune=False): # Tunable parameters block_M, block_N, block_K = 128, 256, 128 # noqa: F841 num_stages = 1 # noqa: F841 @@ -456,8 +437,7 @@ def main(m=256, num_bits = 4 num_elems_per_byte = 8 // num_bits qk = k // num_elems_per_byte - A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, padding_M = get_data( - m, n, k, qk, scale_size, topk, E, block_M) + A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, padding_M = get_data(m, n, k, qk, scale_size, topk, E, block_M) if tune: with set_autotune_inputs([A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids]): @@ -469,9 +449,9 @@ def main(m=256, topk, E, padding_M, - "bfloat16", - "bfloat16", - "float32", + T.bfloat16, + T.bfloat16, + T.float32, num_bits=num_bits, scale_size=scale_size, fast_dequant=fast_dequant, @@ -485,9 +465,9 @@ def main(m=256, topk, E, padding_M, - "bfloat16", - "bfloat16", - "float32", + T.bfloat16, + T.bfloat16, + T.float32, num_bits=num_bits, scale_size=scale_size, fast_dequant=fast_dequant, @@ -509,15 +489,11 @@ def main(m=256, sorted_token_ids, expert_ids, ) + print("Tilelang kernel run finished.") - print('Tilelang kernel run finished.') - - ref_output = ref_moe( - A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, - block_M=block_M) # Maybe a little bit slow... + ref_output = ref_moe(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, block_M=block_M) # Maybe a little bit slow... - latency = tilelang.profiler.do_bench( - lambda: kernel(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids), warmup=100) + latency = tilelang.profiler.do_bench(lambda: kernel(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids), warmup=100) print("Tilelang: {:.2f} ms".format(latency)) print("Tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) @@ -525,32 +501,72 @@ def main(m=256, max_val = diff.max() max_idx = diff.argmax() print(f"max abs diff: {max_val} at index: {max_idx}") - assert_similar( - output, ref_output, name="output", - eps=2e-5) # We care about the similarity rather than abs. difference + assert_similar(output, ref_output, name="output", eps=2e-5) # We care about the similarity rather than abs. difference print("All checks pass. 
✅") +def run_regression_perf(m=4096, n=4096, k=4096, scale_size=32, topk=4, E=32, fast_dequant=True, with_bias=False, tune=False): + block_M, block_N, block_K = 128, 256, 128 + num_stages = 1 + threads = 512 + split = 1 + num_bits = 4 + num_elems_per_byte = 8 // num_bits + qk = k // num_elems_per_byte + A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, padding_M = get_data(m, n, k, qk, scale_size, topk, E, block_M) + + if tune: + with set_autotune_inputs([A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids]): + kernel = matmul( + m, + n, + k, + topk, + E, + padding_M, + "bfloat16", + "bfloat16", + "float32", + num_bits=num_bits, + scale_size=scale_size, + fast_dequant=fast_dequant, + with_bias=with_bias, + ) + else: + kernel = matmul( + m, + n, + k, + topk, + E, + padding_M, + "bfloat16", + "bfloat16", + "float32", + num_bits=num_bits, + scale_size=scale_size, + fast_dequant=fast_dequant, + with_bias=with_bias, + block_M=block_M, + block_N=block_N, + block_K=block_K, + num_stages=num_stages, + threads=threads, + split=split, + ) + + return tilelang.profiler.do_bench(lambda: kernel(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids), backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--M", type=int, default=16384, help="M") # From gpt-oss-20b MoE's first gemm - parser.add_argument("--N", type=int, default=5760, help="N") - parser.add_argument("--K", type=int, default=2944, help="K") + parser.add_argument("--M", type=int, default=256, help="M") # From gpt-oss-20b MoE's first gemm + parser.add_argument("--N", type=int, default=256, help="N") + parser.add_argument("--K", type=int, default=256, help="K") parser.add_argument("--scale_size", type=int, default=32, help="scale size") - parser.add_argument( - "--topk", type=int, default=4, help="topk") # experts activated for each token + parser.add_argument("--topk", type=int, default=4, help="topk") # experts activated for each token parser.add_argument("--E", type=int, default=32, help="E") # number of experts parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - main( - args.M, - args.N, - args.K, - args.scale_size, - topk=args.topk, - E=args.E, - fast_dequant=True, - with_bias=True, - tune=args.tune) + main(args.M, args.N, args.K, args.scale_size, topk=args.topk, E=args.E, fast_dequant=True, with_bias=True, tune=args.tune) diff --git a/examples/dequantize_gemm/regression_example_dequantize_gemm.py b/examples/dequantize_gemm/regression_example_dequantize_gemm.py new file mode 100644 index 000000000..4ab03784f --- /dev/null +++ b/examples/dequantize_gemm/regression_example_dequantize_gemm.py @@ -0,0 +1,35 @@ +import tilelang.testing +import example_dequant_gemm_bf16_fp4_hopper +import example_dequant_gemm_bf16_mxfp4_hopper +import example_dequant_gemm_fp4_hopper +import example_dequant_gemm_w4a8 +import example_dequant_gemv_fp16xint4 +import example_dequant_groupedgemm_bf16_mxfp4_hopper + + +def regression_example_dequant_gemv_fp16xint4(): + tilelang.testing.process_func(example_dequant_gemv_fp16xint4.run_regression_perf) + + +def regression_example_dequant_gemm_fp4_hopper(): + tilelang.testing.process_func(example_dequant_gemm_fp4_hopper.run_regression_perf) + + +def regression_example_dequant_gemm_bf16_fp4_hopper(): + tilelang.testing.process_func(example_dequant_gemm_bf16_fp4_hopper.run_regression_perf) + + +def regression_example_dequant_gemm_bf16_mxfp4_hopper(): + 
tilelang.testing.process_func(example_dequant_gemm_bf16_mxfp4_hopper.run_regression_perf) + + +def regression_example_dequant_groupedgemm_bf16_mxfp4_hopper(): + tilelang.testing.process_func(example_dequant_groupedgemm_bf16_mxfp4_hopper.run_regression_perf) + + +def regression_example_dequant_gemm_w4a8(): + tilelang.testing.process_func(example_dequant_gemm_w4a8.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/dequantize_gemm/test_example_dequantize_gemm.py b/examples/dequantize_gemm/test_example_dequantize_gemm.py index 01bc40e6c..a2f777222 100644 --- a/examples/dequantize_gemm/test_example_dequantize_gemm.py +++ b/examples/dequantize_gemm/test_example_dequantize_gemm.py @@ -3,7 +3,6 @@ import example_dequant_gemv_fp16xint4 import example_dequant_gemm_fp4_hopper import example_dequant_gemm_bf16_mxfp4_hopper -import example_dequant_gemm_bf16_mxfp4_hopper_tma import example_dequant_groupedgemm_bf16_mxfp4_hopper import example_dequant_gemm_w4a8 @@ -25,12 +24,6 @@ def test_example_dequant_gemm_bf16_mxfp4_hopper(): example_dequant_gemm_bf16_mxfp4_hopper.main() -@tilelang.testing.requires_cuda -@tilelang.testing.requires_cuda_compute_version_ge(9, 0) -def test_example_dequant_gemm_bf16_mxfp4_hopper_tma(): - example_dequant_gemm_bf16_mxfp4_hopper_tma.main() - - @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_dequant_groupedgemm_bf16_mxfp4_hopper(): diff --git a/examples/dsa_sparse_finetune/dsa.py b/examples/dsa_sparse_finetune/dsa.py new file mode 100644 index 000000000..9fae8e5e3 --- /dev/null +++ b/examples/dsa_sparse_finetune/dsa.py @@ -0,0 +1,223 @@ +from typing import Optional +import torch +import torch.nn.functional as F +from indexer_topk_reducesum import indexer_topk_reducesum_interface +from indexer_bwd import indexer_bwd_interface +from sparse_mla_fwd import sparse_mla_fwd_interface +from sparse_mla_bwd import sparse_mla_bwd +from sparse_mla_topk_reducesum import sparse_mla_topk_reducesum_interface +from einops import einsum, repeat +from utils import get_abs_err, get_err_ratio + + +class RegsiterLossFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, x, loss): + ctx.save_for_backward(loss) + return x + + @staticmethod + def backward(ctx, grad): + loss = ctx.saved_tensors + return grad, torch.ones(1, dtype=loss[0].dtype, device=loss[0].device) + + +register_loss = RegsiterLossFunction.apply + + +def ref_deepseek_sparse_attention_innner( + q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, + index_sm_scale: Optional[float] = None, +): + dtype = q.dtype + q, kv, index_q, index_k, weights = map(lambda x: x.to(torch.float32), (q, kv, index_q, index_k, weights)) + + index_sm_scale = index_q.shape[-1] ** -0.5 + b, s = index_q.shape[:2] + + # tl_topk_indices = tl_topk_indices.to(torch.int64) + # tl_topk_indices[tl_topk_indices == -1] = s + + casual_mask = (torch.arange(s)[:, None] >= torch.arange(s)[None, :]).to(q.device) + index_logits = einsum(index_q, index_k, "b s1 h k, b s2 k -> b s1 h s2") + index_logits = F.relu(index_logits) + index_logits = (index_logits * weights.unsqueeze(-1)).sum(dim=-2, dtype=torch.float32) * index_sm_scale + index_logits = torch.where(casual_mask, index_logits, float("-inf")) + topk_indices = torch.topk(index_logits, k=topk, dim=-1).indices + topk_logits = torch.gather(F.pad(index_logits, (0, 1), 
value=float("-inf")), dim=-1, index=topk_indices) + topk_score = F.log_softmax(topk_logits, dim=-1, dtype=torch.float32) + index_topk_score = topk_score + + if sm_scale is None: + sm_scale = kv.shape[-1] ** -0.5 + + h = q.shape[-2] + index_mask = torch.zeros((b, s, s + 1), dtype=torch.bool, device="cuda").scatter_( + dim=-1, index=topk_indices, src=torch.ones_like(topk_indices, dtype=torch.bool) + )[:, :, :-1] + mask = repeat(casual_mask & index_mask, "b s1 s2 -> b s1 h s2", h=h) + k, v = kv, kv[..., :dim_v] + logits = einsum(q, k, "b s1 h d, b s2 d -> b s1 h s2") * sm_scale + logits = torch.where(mask, logits, float("-inf")) + attn_score = F.softmax(logits, dim=-1, dtype=torch.float32) + o = einsum(attn_score, v, "b s1 h s2, b s2 d -> b s1 h d") + + attn_score = attn_score.sum(dim=-2) # [b, s1, s2] + attn_topk_score = torch.gather(F.pad(attn_score, (0, 1)), dim=-1, index=topk_indices) + attn_topk_score = attn_topk_score / attn_topk_score.sum(dim=-1, keepdim=True) + + loss = F.kl_div(index_topk_score.clip(-100, 0), attn_topk_score.detach().log().clip(-100, 0), log_target=True, reduction="sum") + o = register_loss(o, loss) + + return o.to(dtype), topk_indices + + +def ref_deepseek_sparse_attention( + q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + offsets: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, + index_sm_scale: Optional[float] = None, +): + all_o, all_topk_indices = [], [] + for i in range(offsets.shape[0] - 1): + o, topk_indices = ref_deepseek_sparse_attention_innner( + q[None, offsets[i] : offsets[i + 1]], + kv[None, offsets[i] : offsets[i + 1]], + index_q[None, offsets[i] : offsets[i + 1]], + index_k[None, offsets[i] : offsets[i + 1]], + weights[None, offsets[i] : offsets[i + 1]], + topk, + dim_v, + sm_scale, + index_sm_scale, + ) + all_o.append(o.squeeze(0)) + all_topk_indices.append(topk_indices.squeeze(0)) + o = torch.cat(all_o, dim=0) + topk_indices = torch.cat(all_topk_indices, dim=0) + return o, topk_indices + + +class DSAFunction(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + offsets: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, + ): + # topk_indices, index_score = ref_index_score(index_q, weights, index_k, topk) + topk_indices, index_score = indexer_topk_reducesum_interface(index_q, weights, index_k, topk, offsets) + o, lse = sparse_mla_fwd_interface(q, kv.unsqueeze(-2), topk_indices.unsqueeze(-2), offsets, sm_scale=sm_scale, d_v=dim_v) + ctx.save_for_backward(q, kv, index_q, index_k, weights, topk_indices, index_score, o, lse, offsets) + ctx.topk = topk + ctx.dim_v = dim_v + ctx.sm_scale = sm_scale + return o, topk_indices + + @staticmethod + def backward( + ctx, + do: torch.Tensor, + _1: torch.Tensor, + ): + q, kv, index_q, index_k, weights, topk_indices, index_score, o, lse, offsets = ctx.saved_tensors + attn_score = sparse_mla_topk_reducesum_interface( + q, kv.unsqueeze(-2), topk_indices.unsqueeze(-2), lse, offsets, dim_v=ctx.dim_v + ).squeeze(-2) + dq, dkv = sparse_mla_bwd(q, kv.unsqueeze(-2), o, do, topk_indices.unsqueeze(-2), lse, offsets, sm_scale=ctx.sm_scale) + dindex_q, dweights, dindex_k = indexer_bwd_interface(index_q, weights, index_k, attn_score, index_score, topk_indices, offsets) + return dq, dkv.squeeze(-2), dindex_q, dindex_k, dweights, None, None, None, None + + +def deepseek_sparse_attention( + 
q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + offsets: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, +): + return DSAFunction.apply(q, kv, index_q, index_k, weights, offsets, topk, dim_v, sm_scale) + + +def test_kernel( + B=1, + S=2048, + H=16, + D=512, + tail_D=64, + index_D=128, + topk=64, +): + torch.manual_seed(42) + q = torch.randn((S, H, D + tail_D)).cuda().bfloat16().requires_grad_() + kv = torch.randn((S, D + tail_D)).cuda().bfloat16().requires_grad_() + index_q = torch.randn((S, H, index_D)).cuda().bfloat16().requires_grad_() + weights = torch.randn((S, H)).cuda().bfloat16().requires_grad_() + index_k = torch.randn((S, index_D)).cuda().bfloat16().requires_grad_() + do = torch.randn((S, H, D)).cuda().bfloat16().requires_grad_() + offsets = torch.tensor([0, S // 2, S], dtype=torch.int32).cuda() + + o, topk_indices = deepseek_sparse_attention(q, kv, index_q, index_k, weights, offsets, topk, D) + o.backward(do) + q_grad, q.grad = q.grad, None + kv_grad, kv.grad = kv.grad, None + index_q_grad, index_q.grad = index_q.grad, None + index_k_grad, index_k.grad = index_k.grad, None + weights_grad, weights.grad = weights.grad, None + + ref_o, ref_topk_indices = ref_deepseek_sparse_attention(q, kv, index_q, index_k, weights, offsets, topk, D) + ref_o.backward(do) + ref_q_grad, q.grad = q.grad, None + ref_kv_grad, kv.grad = kv.grad, None + ref_index_q_grad, index_q.grad = index_q.grad, None + ref_index_k_grad, index_k.grad = index_k.grad, None + ref_weights_grad, weights.grad = weights.grad, None + + print(f"o err: {get_abs_err(o, ref_o):.6f} ratio: {get_err_ratio(o, ref_o):.6f}") + print(f"q.grad err: {get_abs_err(q_grad, ref_q_grad):.6f} ratio: {get_err_ratio(q_grad, ref_q_grad):.6f}") + print(f"kv.grad err: {get_abs_err(kv_grad, ref_kv_grad):.6f} ratio: {get_err_ratio(kv_grad, ref_kv_grad):.6f}") + print( + f"index_q.grad err: {get_abs_err(index_q_grad[:, :64, :], ref_index_q_grad[:, :64, :]):.6f} ratio: {get_err_ratio(index_q_grad[:, :64, :], ref_index_q_grad[:, :64, :]):.6f}" + ) + print(f"index_k.grad err: {get_abs_err(index_k_grad, ref_index_k_grad):.6f} ratio: {get_err_ratio(index_k_grad, ref_index_k_grad):.6f}") + print(f"weights.grad err: {get_abs_err(weights_grad, ref_weights_grad):.6f} ratio: {get_err_ratio(weights_grad, ref_weights_grad):.6f}") + + intersections = [] + for j in range(S): + ref_np = ref_topk_indices[j].cpu().to(torch.int32).numpy() + trt_np = topk_indices[j].cpu().to(torch.int32).numpy() + + mask = trt_np != -1 + + set_ref = set(ref_np[mask]) + set_trt = set(trt_np[mask]) + intersection = set_ref & set_trt + intersections.append(len(intersection) / len(set_ref)) + print("average intersections: {:.4f}".format(sum(intersections) / len(intersections))) + + +test_kernel() diff --git a/examples/dsa_sparse_finetune/index.py b/examples/dsa_sparse_finetune/index.py new file mode 100644 index 000000000..5e4800411 --- /dev/null +++ b/examples/dsa_sparse_finetune/index.py @@ -0,0 +1,82 @@ +# Modified from: https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/utils/index.py +import torch +import torch.nn.functional as F +import functools +from typing import Callable, Any + + +def tensor_cache( + fn: Callable[..., torch.Tensor], +) -> Callable[..., torch.Tensor]: + """ + A decorator that caches the most recent result of a function with tensor inputs. + + This decorator will store the output of the decorated function for the most recent set of input tensors. 
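+    Matching is by object identity (`a is b`), not by tensor value: given `cached_fn = tensor_cache(fn)`,
+    calling `cached_fn(x)` twice with the same tensor object returns the cached result, while
+    `cached_fn(x.clone())` recomputes even though the values are equal.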
+ If the function is called again with the same input tensors, it will return the cached result. + + + Args: + fn (Callable[..., torch.Tensor]): + The function to be decorated. It should take tensor inputs and return tensor outputs. + + Returns: + Callable[..., torch.Tensor]: + A wrapped version of the input function with single-entry caching. + """ + last_args: tuple | None = None + last_kwargs: dict | None = None + last_result: Any = None + + @functools.wraps(fn) + def wrapper(*args: Any, **kwargs: Any) -> Any: + nonlocal last_args, last_kwargs, last_result + + if ( + (last_args is not None and last_kwargs is not None) + and (len(args) == len(last_args) and len(kwargs) == len(last_kwargs)) + and all(a is b for a, b in zip(args, last_args, strict=False)) + and all(k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items()) + ): + return last_result + + result = fn(*args, **kwargs) + last_args, last_kwargs, last_result = args, kwargs, result + return result + + return wrapper + + +@tensor_cache +def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return torch.diff(cu_seqlens) + + +@tensor_cache +def prepare_cu_seqlens_from_lens( + lens: torch.LongTensor, + dtype: torch.dtype | None = torch.int32, +) -> torch.LongTensor: + return F.pad(lens.cumsum(dim=0, dtype=dtype), (1, 0)) + + +@tensor_cache +def prepare_lens_from_cu_seqlens( + cu_seqlens: torch.LongTensor, +) -> torch.LongTensor: + return torch.diff(cu_seqlens) + + +@tensor_cache +def prepare_position_ids(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return torch.cat([torch.arange(n, dtype=cu_seqlens.dtype, device=cu_seqlens.device) for n in prepare_lens(cu_seqlens).unbind()]) + + +@tensor_cache +def prepare_sequence_ids(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return prepare_position_ids(cu_seqlens).eq(0).cumsum(0) - 1 + + +@tensor_cache +def prepare_token_indices(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + position_ids = prepare_position_ids(cu_seqlens) + return torch.stack([prepare_sequence_ids(cu_seqlens), position_ids], 1).to(cu_seqlens) diff --git a/examples/dsa_sparse_finetune/indexer_bwd.py b/examples/dsa_sparse_finetune/indexer_bwd.py new file mode 100644 index 000000000..68508ad4e --- /dev/null +++ b/examples/dsa_sparse_finetune/indexer_bwd.py @@ -0,0 +1,254 @@ +import torch +import torch.nn.functional as F +from einops import einsum, repeat + +import tilelang as tl +import tilelang.language as T +from typing import Optional +from index import prepare_token_indices + +from utils import get_abs_err, get_err_ratio + +BF16 = T.bfloat16 +FP32 = T.float32 +INT32 = T.int32 + +pass_configs = { + tl.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tl.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, +} + + +@tl.jit(pass_configs=pass_configs) +def tl_indexer_bwd_impl( + heads: int, + dim: int, + topk: int, + sm_scale: Optional[float] = None, + block_I: int = 32, + num_stages: int = 0, + num_threads: int = 128, +): + assert num_stages == 0 + assert topk == tl.math.next_power_of_2(topk) + assert topk % block_I == 0 + assert heads <= 64 and heads % 8 == 0 + batch_plus_one = T.symbolic("batch_plus_one") + seq_len = T.symbolic("seq_len") + dtype: str = BF16 + accum_dtype: str = FP32 + index_q_shape = [seq_len, heads, dim] + weights_shape = [seq_len, heads] + index_k_shape = [seq_len, dim] + shape_p = [seq_len, topk] + topk_indices_shape = [seq_len, topk] + offsets_shape = [batch_plus_one] + token_indices_shape = [seq_len, 2] + if sm_scale is None: + sm_scale = dim**-0.5 + + @T.prim_func + def 
tl_indexer_bwd_kernel( + IndexQ: T.Tensor(index_q_shape, dtype), + Weights: T.Tensor(weights_shape, dtype), + IndexK: T.Tensor(index_k_shape, dtype), + dIndexQ: T.Tensor(index_q_shape, dtype), + dWeights: T.Tensor(weights_shape, dtype), + dIndexK: T.Tensor(index_k_shape, dtype), + AttnScore: T.Tensor(shape_p, FP32), + IndexScore: T.Tensor(shape_p, FP32), + TopkIndices: T.Tensor(topk_indices_shape, INT32), + Offsets: T.Tensor(offsets_shape, INT32), + TokenIndices: T.Tensor(token_indices_shape, INT32), + ): + with T.Kernel(seq_len, threads=num_threads) as (bx): + i_b, i_t = TokenIndices[bx, 0], TokenIndices[bx, 1] + bos = Offsets[i_b] + num_blocks = T.ceildiv(topk, block_I) + + index_q_shared = T.alloc_shared([heads, dim], dtype=dtype) + weights_shared = T.alloc_shared([heads], dtype=dtype) + + d_index_q_frag = T.alloc_fragment([heads, dim], dtype=accum_dtype) + d_weights_frag = T.alloc_fragment([heads], dtype=accum_dtype) + + T.copy(IndexQ[bos + i_t, :, :], index_q_shared) + T.copy(Weights[bos + i_t, :], weights_shared) + T.fill(d_index_q_frag, 0) + T.fill(d_weights_frag, 0) + + for i, j in T.Parallel(heads, dim): + index_q_shared[i, j] = index_q_shared[i, j] * sm_scale + + for bi_i in T.Pipelined(num_blocks, num_stages=num_stages): + i_st = bi_i * block_I + i_ed = (bi_i + 1) * block_I + + indices_shared = T.alloc_shared([block_I], dtype=INT32) + T.copy(TopkIndices[bos + i_t, i_st:i_ed], indices_shared) + + index_k_shared = T.alloc_shared([block_I, dim], dtype=dtype) + for i, j in T.Parallel(block_I, dim): + pos = indices_shared[i] + index_k_shared[i, j] = T.if_then_else((pos > -1) & (pos <= i_t), IndexK[bos + pos, j], 0) + + attn_score_shared = T.alloc_shared([block_I], dtype=accum_dtype) + index_score_shared = T.alloc_shared([block_I], dtype=accum_dtype) + for i in T.Parallel(block_I): + attn_score_shared[i] = AttnScore[bos + i_t, i_st + i] + index_score_shared[i] = IndexScore[bos + i_t, i_st + i] + + logits = T.alloc_fragment((block_I, heads), accum_dtype) + T.gemm( + index_k_shared, + index_q_shared, + logits, + transpose_A=False, + transpose_B=True, + clear_accum=True, + ) + for i, j in T.Parallel(block_I, heads): + logits[i, j] = T.max(logits[i, j], 0) + + # dw + d_weights_i = T.alloc_fragment((block_I, heads), accum_dtype) + for i, j in T.Parallel(block_I, heads): + d_weights_i[i, j] = (index_score_shared[i] - attn_score_shared[i]) * logits[i, j] + T.reduce_sum(d_weights_i, d_weights_frag, dim=0, clear=False) + + d_logits_qk = T.alloc_shared((block_I, heads), accum_dtype) + d_logits_qk_cast1 = T.alloc_fragment((block_I, heads), dtype) + d_logits_qk_cast2 = T.alloc_fragment((block_I, heads), dtype) + + for i, j in T.Parallel(block_I, heads): + d_relu = T.alloc_var(accum_dtype) + if logits[i, j] > 0: + d_relu = 1.0 + else: + d_relu = 0.0 + d_logits_qk[i, j] = (index_score_shared[i] - attn_score_shared[i]) * d_relu * weights_shared[j] + + # dq + T.copy(d_logits_qk, d_logits_qk_cast1) + T.gemm( + d_logits_qk_cast1, # [BS, HQ] + index_k_shared, # [BS, K] + d_index_q_frag, # [HQ, K] + transpose_A=True, + transpose_B=False, + clear_accum=False, + ) + + # dk + T.copy(d_logits_qk, d_logits_qk_cast2) + d_index_k_frag = T.alloc_fragment([block_I, dim], dtype=accum_dtype) + T.gemm( + d_logits_qk_cast2, # [BS, HQ] + index_q_shared, # [HQ, K] + d_index_k_frag, # [BS, K] + transpose_A=False, + transpose_B=False, + clear_accum=True, + ) + + for i, j in T.Parallel(block_I, dim): + pos = indices_shared[i] + if (pos > -1) & (pos <= i_t): + T.atomic_add(dIndexK[bos + pos, j], d_index_k_frag[i, j]) + + 
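+                # Note: dIndexK above is scattered with T.atomic_add because different query tokens
+                # (one thread block per token) can select the same key position in their top-k sets.
+
+            # d_index_q_frag holds the gradient w.r.t. the scaled query (index_q_shared was
+            # multiplied by sm_scale above), so apply the chain-rule factor sm_scale before
+            # copying the result back to dIndexQ.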
for i, j in T.Parallel(heads, dim): + d_index_q_frag[i, j] = d_index_q_frag[i, j] * sm_scale + + T.copy(d_index_q_frag, dIndexQ[bos + i_t, :, :]) + T.copy(d_weights_frag, dWeights[bos + i_t, :]) + + return tl_indexer_bwd_kernel + + +def indexer_bwd_interface( + q: torch.Tensor, + weights: torch.Tensor, + k: torch.Tensor, + attn_score: torch.Tensor, + index_score: torch.Tensor, + topk_indices: torch.Tensor, + offsets: torch.Tensor, +): + _, heads, dim, topk = *q.shape, topk_indices.shape[-1] + token_indices = prepare_token_indices(offsets) + dq = torch.zeros_like(q) + dweights = torch.zeros_like(weights) + dk = torch.zeros_like(k) + kernel = tl_indexer_bwd_impl(heads, dim, topk) + kernel(q, weights, k, dq, dweights, dk, attn_score, index_score, topk_indices, offsets, token_indices) + return dq, dweights, dk + + +def ref_indexer_bwd( + Q: torch.Tensor, Weights: torch.Tensor, K: torch.Tensor, TopkIndices: torch.Tensor, AttnScore: torch.Tensor, offsets: torch.Tensor +) -> torch.Tensor: + Q.requires_grad_(True) + Weights.requires_grad_(True) + K.requires_grad_(True) + softmax_scale = Q.shape[-1] ** -0.5 + all_loss = [] + all_log_topk_prob = [] + for i in range(offsets.shape[0] - 1): + assert (offsets[i + 1] - offsets[i]).item() >= TopkIndices.shape[-1] + q = Q[offsets[i] : offsets[i + 1]] + weights = Weights[offsets[i] : offsets[i + 1]] + k = K[offsets[i] : offsets[i + 1]] + topk_indices = TopkIndices[offsets[i] : offsets[i + 1]] + attn_score = AttnScore[offsets[i] : offsets[i + 1]] + s = q.shape[0] + mask = (torch.arange(s)[:, None] >= torch.arange(s)[None, :]).to(q.device) + logits = einsum(q, k, "s1 h k, s2 k -> s1 h s2") * softmax_scale + logits = F.relu(logits) + score = (logits * weights.unsqueeze(-1)).sum(dim=-2, dtype=torch.float32) + score = torch.where(mask, score, float("-inf")) + topk_value = torch.gather(score, dim=-1, index=topk_indices.to(torch.int64)) + log_topk_prob = F.log_softmax(topk_value, dim=-1, dtype=torch.float32) + loss = F.kl_div(log_topk_prob.clip(-100, 0), attn_score.log().clip(-100, 0), log_target=True, reduction="sum") + all_loss.append(loss) + all_log_topk_prob.append(log_topk_prob) + loss = torch.stack(all_loss).sum() + loss.backward() + log_topk_prob = torch.cat(all_log_topk_prob, dim=0) + return log_topk_prob.exp(), Q.grad, Weights.grad, K.grad + + +def test_kernel( + B=1, + S=2048, + H=16, + D=128, + topk=64, +): + torch.manual_seed(42) + q = torch.randn((S, H, D)).cuda().bfloat16() + w = torch.randn((S, H)).cuda().bfloat16() + k = torch.randn((S, D)).cuda().bfloat16() + offsets = torch.tensor([0, 1023, S], dtype=torch.int32).cuda() + + all_attn_score = [] + for i in range(offsets.shape[0] - 1): + seq_len = (offsets[i + 1] - offsets[i]).item() + mask = (torch.arange(seq_len)[:, None] >= torch.arange(topk)[None, :]).to(q.device) + logits = torch.ones(seq_len, topk).cuda() + logits = torch.where(mask, logits, float("-inf")) + attn_score = F.softmax(logits, dim=-1, dtype=torch.float32) + all_attn_score.append(attn_score) + attn_score = torch.cat(all_attn_score, dim=0) + + topk_indices = repeat(torch.arange(topk, dtype=torch.int32).cuda(), "k -> s k", s=S).contiguous() + index_score, ref_dq, ref_dw, ref_dk = ref_indexer_bwd(q, w, k, topk_indices, attn_score, offsets) + + dq, dw, dk = indexer_bwd_interface(q, w, k, attn_score, index_score, topk_indices, offsets) + + print(f"dq err: {get_abs_err(dq, ref_dq):.6f} ratio: {get_err_ratio(dq, ref_dq):.6f}") + print(f"dq err: {get_abs_err(dw, ref_dw):.6f} ratio: {get_err_ratio(dw, ref_dw):.6f}") + print(f"dq err: 
{get_abs_err(dk, ref_dk):.6f} ratio: {get_err_ratio(dk, ref_dk):.6f}") + + +if __name__ == "__main__": + test_kernel() diff --git a/examples/dsa_sparse_finetune/indexer_topk_reducesum.py b/examples/dsa_sparse_finetune/indexer_topk_reducesum.py new file mode 100644 index 000000000..d76eb0272 --- /dev/null +++ b/examples/dsa_sparse_finetune/indexer_topk_reducesum.py @@ -0,0 +1,273 @@ +import math +import torch +import torch.nn.functional as F +from einops import einsum + +import tilelang as tl +import tilelang.language as T +from typing import Optional +from index import prepare_token_indices + +from utils import get_abs_err, get_err_ratio + +BF16 = T.bfloat16 +FP32 = T.float32 +INT32 = T.int32 + +pass_configs = { + tl.PassConfigKey.TL_DISABLE_THREAD_STORAGE_SYNC: True, + tl.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tl.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, +} + + +@tl.jit(pass_configs=pass_configs) +def tl_indexer_topk_reducesum_impl( + heads: int, + dim: int, + topk: int, + sm_scale: Optional[float] = None, + block_K: int = 32, + dtype: str = FP32, + num_stages: int = 0, + num_threads: int = 128, +): + assert topk == tl.math.next_power_of_2(topk) + assert topk % block_K == 0 + assert heads <= 64 and heads % 8 == 0 + assert num_stages == 0 + batch_plus_one = T.symbolic("batch_plus_one") + seq_len = T.symbolic("seq_len") + + index_q_shape = [seq_len, heads, dim] + weights_shape = [seq_len, heads] + index_k_shape = [seq_len, dim] + topk_indices_shape = [seq_len, topk] + offsets_shape = [batch_plus_one] + token_indices_shape = [seq_len, 2] + + N = 2 * topk + num_iters = int(round(math.log2(N))) + if sm_scale is None: + sm_scale = dim**-0.5 + + @T.macro + def bitonic_sort( + topk_index_shared: T.SharedBuffer([N], dtype=INT32), + topk_value_shared: T.SharedBuffer([N], dtype=FP32), + ): + T.sync_threads() + for i1 in T.serial(num_iters): + for i2 in T.serial(i1 + 1): + for i in T.Parallel(N): + ascending = (i & (1 << (i1 + 1))) != 0 + j = i ^ (1 << (i1 - i2)) + if i < j and ( + (ascending and topk_value_shared[i] > topk_value_shared[j]) + or (not ascending and topk_value_shared[i] < topk_value_shared[j]) + ): + val = topk_value_shared[i] + topk_value_shared[i] = topk_value_shared[j] + topk_value_shared[j] = val + idx = topk_index_shared[i] + topk_index_shared[i] = topk_index_shared[j] + topk_index_shared[j] = idx + T.sync_threads() + + @T.prim_func + def tl_indexer_topk_reducesum_kernel( + IndexQ: T.Tensor(index_q_shape, dtype), + Weights: T.Tensor(weights_shape, dtype), + IndexK: T.Tensor(index_k_shape, dtype), + TopkIndices: T.Tensor(topk_indices_shape, INT32), + ReduceSum: T.Tensor(topk_indices_shape, FP32), + Offsets: T.Tensor(offsets_shape, INT32), + TokenIndices: T.Tensor(token_indices_shape, INT32), + ): + with T.Kernel(seq_len, threads=num_threads) as (bx): + i_b, i_t = TokenIndices[bx, 0], TokenIndices[bx, 1] + bos, eos = Offsets[i_b], Offsets[i_b + 1] + num_blocks = T.ceildiv(i_t + 1, block_K) + + topk_index_shared = T.alloc_shared([N], dtype=INT32) + topk_value_shared = T.alloc_shared([N], dtype=FP32) + + T.fill(topk_index_shared, -1) + T.fill(topk_value_shared, float("-inf")) + T.sync_threads() + + index_q_shared = T.alloc_shared([heads, dim], dtype=dtype) + T.copy(IndexQ[bos + i_t, :, :], index_q_shared) + T.sync_threads() + + weights_frag = T.alloc_shared([heads], dtype=dtype) + T.copy(Weights[bos + i_t, :], weights_frag) + T.sync_threads() + + for i, j in T.Parallel(heads, dim): + index_q_shared[i, j] = index_q_shared[i, j] * sm_scale + T.sync_threads() + + for bk_i in 
T.Pipelined(num_blocks, num_stages=num_stages): + k_st = bk_i * block_K + k_ed = T.min((bk_i + 1) * block_K, eos - bos) + + index_k_shared = T.alloc_shared([block_K, dim], dtype=dtype) + for i, j in T.Parallel(block_K, dim): + index_k_shared[i, j] = T.if_then_else(k_st + i < k_ed, IndexK[bos + k_st + i, j], 0) + T.sync_threads() + + logits = T.alloc_fragment((block_K, heads), FP32) + T.gemm( + index_k_shared, + index_q_shared, + logits, + transpose_A=False, + transpose_B=True, + clear_accum=True, + ) + T.sync_threads() + + for i, j in T.Parallel(block_K, heads): + logits[i, j] = T.max(logits[i, j], 0) * weights_frag[j] + T.sync_threads() + + logits_sum = T.alloc_fragment(block_K, FP32) + T.reduce_sum(logits, logits_sum, dim=1) + T.sync_threads() + + offset = T.alloc_var(INT32) + if k_st >= topk: + offset = topk + (k_st % topk) + else: + offset = k_st + T.sync_threads() + for i in T.Parallel(block_K): + if k_st + i > i_t: + logits_sum[i] = float("-inf") + j = offset + i + topk_index_shared[j] = k_st + i + topk_value_shared[j] = logits_sum[i] + T.sync_threads() + + if k_ed > topk and k_ed % topk == 0: + bitonic_sort(topk_index_shared, topk_value_shared) + + bitonic_sort(topk_index_shared, topk_value_shared) + + logits_max_frag = T.alloc_fragment([1], dtype=FP32) + logits_frag = T.alloc_fragment([topk], dtype=FP32) + reducesum_shared = T.alloc_shared([topk], dtype=FP32) + + T.copy(topk_value_shared[:topk], logits_frag) + T.sync_threads() + + T.reduce_max(logits_frag, logits_max_frag, dim=-1) + T.sync_threads() + + for i in T.Parallel(topk): + logits_frag[i] = T.exp(logits_frag[i] - logits_max_frag[0]) + T.sync_threads() + + lse_frag = T.alloc_fragment([1], dtype=FP32) + T.reduce_sum(logits_frag, lse_frag) + T.sync_threads() + + for i in T.Parallel(topk): + reducesum_shared[i] = logits_frag[i] / lse_frag[0] + T.sync_threads() + + # for i in T.Parallel(topk): + # reducesum_shared[i] = logits_frag[i] + # T.sync_threads() + + for i in T.Parallel(topk): + if topk_index_shared[i] > i_t: + topk_index_shared[i] = -1 + T.sync_threads() + + T.copy(topk_index_shared[:topk], TopkIndices[bos + i_t, :]) + T.copy(reducesum_shared[:topk], ReduceSum[bos + i_t, :]) + + return tl_indexer_topk_reducesum_kernel + + +def indexer_topk_reducesum_interface( + q: torch.Tensor, + weights: torch.Tensor, + k: torch.Tensor, + topk: int, + offsets: torch.Tensor, + dtype: str = BF16, +): + seq_len, heads, dim = q.shape + kernel = tl_indexer_topk_reducesum_impl(heads=heads, dim=dim, topk=topk, dtype=dtype) + token_indices = prepare_token_indices(offsets) + topk_indices = torch.zeros((seq_len, topk), device=q.device, dtype=torch.int32) + topk_score = torch.zeros((seq_len, topk), device=q.device, dtype=torch.float32) + kernel(q, weights, k, topk_indices, topk_score, offsets, token_indices) + return topk_indices, topk_score + + +def ref_index_score(Q: torch.Tensor, Weights: torch.Tensor, K: torch.Tensor, topk: int, offsets: torch.Tensor) -> torch.Tensor: + all_topk_indices = [] + all_topk_score = [] + for i in range(offsets.shape[0] - 1): + assert (offsets[i + 1] - offsets[i]).item() >= topk + q = Q[offsets[i] : offsets[i + 1]] + weights = Weights[offsets[i] : offsets[i + 1]] + k = K[offsets[i] : offsets[i + 1]] + softmax_scale = q.shape[-1] ** -0.5 + s = q.shape[0] + mask = (torch.arange(s)[:, None] >= torch.arange(s)[None, :]).to(q.device) + logits = einsum(q, k, "s1 h k, s2 k -> s1 h s2") + logits = F.relu(logits) + logits = (logits * weights.unsqueeze(-1)).sum(dim=-2, dtype=torch.float32) * softmax_scale + logits = 
torch.where(mask, logits, float("-inf")) + topk_logits, topk_indices = torch.topk(logits, k=topk, dim=-1) + topk_score = F.softmax(topk_logits, dim=-1, dtype=torch.float32) + all_topk_indices.append(topk_indices) + all_topk_score.append(topk_score) + topk_indices = torch.cat(all_topk_indices, dim=0) + topk_score = torch.cat(all_topk_score, dim=0) + return topk_indices, topk_score + + +def test_kernel( + B=1, + S=2048, + H=64, + D=128, + topk=64, +): + torch.manual_seed(42) + + q = torch.randn((S, H, D)).cuda().bfloat16() + weights = torch.randn((S, H)).cuda().bfloat16() + k = torch.randn((S, D)).cuda().bfloat16() + offsets = torch.tensor([0, S], dtype=torch.int32).cuda() + + ref_topk_indices, ref_topk_score = ref_index_score(q, weights, k, topk, offsets) + + topk_indices, topk_score = indexer_topk_reducesum_interface(q, weights, k, topk, offsets) + + for j in range(S): + ref_np = ref_topk_indices[j].cpu().to(torch.int32).numpy() + trt_np = topk_indices[j].cpu().to(torch.int32).numpy() + + ref_np_val = ref_topk_score[j] + trt_np_val = topk_score[j] + + mask = (ref_np_val > 0).cpu().numpy() + + set_ref = set(ref_np[mask]) + set_trt = set(trt_np[mask]) + intersection = set_ref & set_trt + + print("idx:", j, "selected/all:", len(intersection), "/", len(set_ref), "=", len(intersection) / len(set_ref)) + + print(f"err: {get_abs_err(ref_np_val, trt_np_val):.6f} ratio: {get_err_ratio(ref_np_val, trt_np_val):.6f}") + + +if __name__ == "__main__": + test_kernel() diff --git a/examples/dsa_sparse_finetune/sparse_mla_bwd.py b/examples/dsa_sparse_finetune/sparse_mla_bwd.py new file mode 100644 index 000000000..06eaa8eb3 --- /dev/null +++ b/examples/dsa_sparse_finetune/sparse_mla_bwd.py @@ -0,0 +1,347 @@ +# ruff: noqa +import tilelang +from tilelang import language as T +import torch +from index import prepare_token_indices + +from utils import assert_tensors_similar + + +@tilelang.jit(out_idx=[-1]) +def preprocess( + H, + D, + block_ND=32, + num_stages=5, + dtype=T.bfloat16, + accum_dtype=T.float32, +): + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 + + S = T.symbolic("S") + + shape = [S, H, D] + + @T.prim_func + def preprocess_kernel( + O: T.Tensor(shape, dtype), + dO: T.Tensor(shape, dtype), + Delta: T.Tensor([S, H], accum_dtype), + ): + with T.Kernel(H, T.ceildiv(S, block_ND)) as (bx, by): + o = T.alloc_fragment([block_ND, block_ND], accum_dtype) + do = T.alloc_fragment([block_ND, block_ND], accum_dtype) + delta = T.alloc_fragment([block_ND], accum_dtype) + acc = T.alloc_fragment([block_ND, block_ND], accum_dtype) + T.clear(acc) + for k in T.Pipelined(T.ceildiv(D, block_ND), num_stages=num_stages): + T.copy(O[by * block_ND : (by + 1) * block_ND, bx, k * block_ND : (k + 1) * block_ND], o) + T.copy(dO[by * block_ND : (by + 1) * block_ND, bx, k * block_ND : (k + 1) * block_ND], do) + for i, j in T.Parallel(block_ND, block_ND): + acc[i, j] += o[i, j] * do[i, j] + T.reduce_sum(acc, delta, 1) + T.copy(delta, Delta[by * block_ND : (by + 1) * block_ND, bx]) + + return preprocess_kernel + + +@tilelang.jit(out_idx=[-1]) +def postprocess( + D, + D_tail, + kv_group=1, + block_N=64, + threads=128, + dtype=T.bfloat16, + accum_dtype=T.float32, +): + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 + S_kv = T.symbolic("S_kv") + + dkv_shape = [S_kv, kv_group, D + D_tail] + + @T.prim_func + def postprocess_kernel( + dKV: T.Tensor(dkv_shape, accum_dtype), + dKV_out: T.Tensor(dkv_shape, dtype), + ): + with T.Kernel(T.ceildiv(S_kv, block_N), kv_group, threads=threads) as (bx, by): + T.copy( + 
dKV[bx * block_N : (bx + 1) * block_N, by, :], + dKV_out[bx * block_N : (bx + 1) * block_N, by, :], + ) + + return postprocess_kernel + + +@tilelang.jit( + out_idx=[-2], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }, +) +def bwd( + H, + D, + D_tail, + topk, + kv_group=1, + sm_scale=None, + is_causal=True, + block_size=32, + num_stages=0, + threads=128, + indices_dtype=T.int32, + dtype=T.bfloat16, + accum_dtype=T.float32, +): + assert is_causal == True, "non-casual is not supported now" + assert topk % block_size == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 + assert indices_dtype == T.int32 + + if sm_scale is None: + sm_scale = (D + D_tail) ** (-0.5) + + B_plus_one = T.symbolic("B_plus_one") + S = T.symbolic("S") + + H_kv = H // kv_group + q_shape = [S, H, D + D_tail] + k_shape = [S, kv_group, D + D_tail] + o_shape = [S, H, D] + indices_shape = [S, kv_group, topk] + delta_shape = [S, H] + lse_shape = [S, H] + offsets_shape = [B_plus_one] + token_indices_shape = [S, 2] + assert indices_dtype == T.int32 + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 + + H = H_kv + padded_H = max(tilelang.math.next_power_of_2(H_kv), 16) + BS = block_size + NS = tilelang.cdiv(topk, block_size) + + split_store = 2 + + @T.prim_func + def sparse_mla_bwd_kernel( + Q: T.Tensor(q_shape, dtype), + KV: T.Tensor(k_shape, dtype), + dO: T.Tensor(o_shape, dtype), + Indices: T.Tensor(indices_shape, indices_dtype), + Lse: T.Tensor(lse_shape, accum_dtype), + Delta: T.Tensor(delta_shape, accum_dtype), + Offsets: T.Tensor(offsets_shape, indices_dtype), + TokenIndices: T.Tensor(token_indices_shape, indices_dtype), + dQ: T.Tensor(q_shape, dtype), + dKV: T.Tensor(k_shape, accum_dtype), + ): + with T.Kernel(S, kv_group, threads=threads) as (b_s_i, bz): + Q_shared = T.alloc_shared([padded_H, D], dtype) + Q_tail_shared = T.alloc_shared([padded_H, D_tail], dtype) + KV_shared = T.alloc_shared([BS, D], dtype) + KV_tail_shared = T.alloc_shared([BS, D_tail], dtype) + dO_shared = T.alloc_shared([padded_H, D], dtype) + mask = T.alloc_fragment([BS], "bool") + + P_shared_cast = T.alloc_shared([padded_H, BS], dtype) + dP_shared_cast = T.alloc_shared([padded_H, BS], dtype) + dQ_shared = T.alloc_shared([padded_H, D], dtype) + dQ_tail_shared = T.alloc_shared([padded_H, D_tail], dtype) + + acc_p = T.alloc_fragment([padded_H, BS], accum_dtype) + acc_dp = T.alloc_fragment([padded_H, BS], accum_dtype) + acc_dq = T.alloc_fragment([padded_H, D], accum_dtype) + acc_dq_tail = T.alloc_fragment([padded_H, D_tail], accum_dtype) + acc_dkv = T.alloc_fragment([BS, D], accum_dtype) + acc_dkv_tail = T.alloc_fragment([BS, D_tail], accum_dtype) + acc_dkv_shared = T.view(KV_shared, shape=[BS // split_store, D], dtype=accum_dtype) + acc_dkv_tail_shared = T.view(KV_tail_shared, shape=[BS // split_store, D_tail], dtype=accum_dtype) + + b_i, s_i = TokenIndices[b_s_i, 0], TokenIndices[b_s_i, 1] + bos, eos = Offsets[b_i], Offsets[b_i + 1] + + max_kv_i = s_i + + T.copy(Q[bos + s_i, bz * padded_H : (bz + 1) * padded_H, :D], Q_shared) + T.copy(Q[bos + s_i, bz * padded_H : (bz + 1) * padded_H, D:], Q_tail_shared) + T.copy(dO[bos + s_i, bz * padded_H : (bz + 1) * padded_H, :D], dO_shared) + + T.clear(acc_dq) + T.clear(acc_dq_tail) + + # Process each block of indices + for i_i in T.Pipelined(NS, num_stages=num_stages): + # Check which indices are valid + for bi_i in T.Parallel(BS): 
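+                    # An index is valid only if it lies inside the causal window (<= max_kv_i) and is not the -1 padding sentinel.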
+ mask[bi_i] = (Indices[bos + s_i, bz, i_i * BS + bi_i] <= max_kv_i) & (Indices[bos + s_i, bz, i_i * BS + bi_i] != -1) + + # Compute attention scores + for h_i, bi_i in T.Parallel(padded_H, BS): + acc_p[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_p.dtype)) + + # Load KV, V for this block of indices + for bi_i, d_i in T.Parallel(BS, D): + KV_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i], bz, d_i] + + T.gemm(Q_shared, KV_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + + for bi_i, d_i in T.Parallel(BS, D_tail): + KV_tail_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i], bz, D + d_i] + T.gemm(Q_tail_shared, KV_tail_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + + for h_i, bi_i in T.Parallel(padded_H, BS): + acc_p[h_i, bi_i] = T.exp(acc_p[h_i, bi_i] * sm_scale - Lse[bos + s_i, bz * padded_H + h_i]) + + T.copy(acc_p, P_shared_cast) + + T.gemm(dO_shared, KV_shared, acc_dp, transpose_B=True, policy=T.GemmWarpPolicy.FullCol, clear_accum=True) + + for h_i, bi_i in T.Parallel(padded_H, BS): + acc_dp[h_i, bi_i] = acc_p[h_i, bi_i] * (acc_dp[h_i, bi_i] - Delta[bos + s_i, bz * padded_H + h_i]) * sm_scale + + T.copy(acc_dp, dP_shared_cast) + T.gemm(dP_shared_cast, KV_shared, acc_dq, policy=T.GemmWarpPolicy.FullCol) + T.gemm(dP_shared_cast, KV_tail_shared, acc_dq_tail, policy=T.GemmWarpPolicy.FullCol) + + T.gemm(dP_shared_cast, Q_shared, acc_dkv, transpose_A=True, policy=T.GemmWarpPolicy.FullCol, clear_accum=True) + T.gemm(P_shared_cast, dO_shared, acc_dkv, transpose_A=True, policy=T.GemmWarpPolicy.FullCol) + + T.clear(acc_dkv_tail) + T.gemm(dP_shared_cast, Q_tail_shared, acc_dkv_tail, transpose_A=True, policy=T.GemmWarpPolicy.FullCol) + + for s in range(split_store): + for bi_i, d_i in T.Parallel(BS, D): + if bi_i < BS // split_store: + acc_dkv_shared[bi_i, d_i] = acc_dkv[bi_i + s * (BS // split_store), d_i] + + for bi_i, d_i in T.Parallel(BS, D_tail): + if bi_i < BS // split_store: + acc_dkv_tail_shared[bi_i, d_i] = acc_dkv_tail[bi_i + s * (BS // split_store), d_i] + + for bi_i, d_i in T.Parallel(BS // split_store, D): + T.atomic_add( + dKV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], bz, d_i], + acc_dkv_shared[bi_i, d_i], + ) + + # Atomically update dKV, dKV_tail tensors + for bi_i, d_i in T.Parallel(BS // split_store, D_tail): + T.atomic_add( + dKV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], bz, D + d_i], + acc_dkv_tail_shared[bi_i, d_i], + ) + + # Store the accumulated dQ + T.copy(acc_dq, dQ_shared) + T.copy(acc_dq_tail, dQ_tail_shared) + + T.copy(dQ_shared, dQ[bos + s_i, bz * padded_H : (bz + 1) * padded_H, :D]) + T.copy(dQ_tail_shared, dQ[bos + s_i, bz * padded_H : (bz + 1) * padded_H, D:]) + + return sparse_mla_bwd_kernel + + +def sparse_mla_bwd(q, kv, o, do, indices, lse, offsets, sm_scale=None, is_casual=True, return_kernel=False, delta=None): + assert q.is_contiguous() + assert kv.is_contiguous() + assert indices.is_contiguous() + assert lse.is_contiguous() + S, H, dim_plus_tail_dim = q.shape + S_kv, kv_group, _ = kv.shape + assert kv.shape[-1] == dim_plus_tail_dim + assert S == S_kv + # dim should be assigned + D = 512 + + D_tail = dim_plus_tail_dim - D + topk = indices.shape[-1] + assert indices.shape == (S, kv_group, topk) + assert lse.shape == (S, H) + + token_indices = prepare_token_indices(offsets) + + # Get kernels + preprocess_kernel = preprocess(H, D) + bwd_kernel = bwd(H, D, D_tail, topk, kv_group, sm_scale, is_casual) + 
postprocess_kernel = postprocess(D, D_tail, kv_group) + + if delta is None: + delta = preprocess_kernel(o, do) + dkv = torch.zeros_like(kv, dtype=torch.float32) + dq = bwd_kernel(q, kv, do, indices, lse, delta, offsets, token_indices, dkv) + dkv = postprocess_kernel(dkv) + + return dq, dkv + + +def ref_sparse_mla_bwd_interface(q, kv, o, do, indices, lse, offsets, sm_scale=None, is_casual=True): + from sparse_mla_fwd import ref_sparse_mla_fwd_interface + + q = q.detach().clone() + kv = kv.detach().clone() + q.requires_grad = True + kv.requires_grad = True + o = ref_sparse_mla_fwd_interface(q, kv, indices, offsets, sm_scale, is_casual) + o.backward(do) + return q.grad, kv.grad + + +def test_sparse_mla_bwd(B=1, S=2048, H=64, HKV=1, DQKV=576, DV=512, topk=512, dtype=torch.bfloat16, check_correctness=True): + # Prepare data + q = torch.randn((S, H, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + kv = torch.randn((S, HKV, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((S, H, DV), dtype=dtype, device="cuda") + offsets = torch.tensor([0, S], dtype=torch.int32, device="cuda") + + indices = torch.full((S, HKV, topk), S, dtype=torch.int32, device="cuda") + for i in range(offsets.shape[0] - 1): + seq_len = (offsets[i + 1] - offsets[i]).item() + assert seq_len >= topk + for t in range(seq_len): + for h in range(HKV): + i_i = torch.randperm(max(1, t))[:topk] + indices[offsets[i] + t, h, : len(i_i)] = i_i + + # Forward + from sparse_mla_fwd import sparse_mla_fwd_interface + + tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices, offsets) + + tl_dq, tl_dkv = sparse_mla_bwd(q, kv, tl_out, do, indices, tl_lse, offsets) + ref_dq, ref_dkv = ref_sparse_mla_bwd_interface(q, kv, None, do, indices, None, offsets) + + if check_correctness: + assert_tensors_similar(tl_dq, ref_dq, eps=1e-4, name="dq") + assert_tensors_similar(tl_dkv, ref_dkv, eps=1e-4, name="dkv") + print("assert_tensors_similar passed") + + per_token_flop = 2 * sum( + [ + H * DV * topk, + H * DQKV * topk, + H * DQKV * topk, + H * DQKV * topk, + H * DV * topk, + ] + ) + from tilelang.profiler import do_bench + + def fn(): + return sparse_mla_bwd(q, kv, tl_out, do, indices, tl_lse, offsets) + + ms = do_bench(fn, rep=100, warmup=250) + print(f"Average time: {ms:.3f} ms") + print(f"bwd io bandwidth = ", (B * S * max(DQKV * 2, DQKV + DV) * topk * 2) / (ms * 1e-3) / 1e12) + print(f"bwd tflops = ", per_token_flop * S / (ms * 1e-3) / 1e12) + + +if __name__ == "__main__": + test_sparse_mla_bwd(B=1, S=2048, H=64, HKV=1, DQKV=576, DV=512, topk=512, dtype=torch.bfloat16, check_correctness=True) diff --git a/examples/dsa_sparse_finetune/sparse_mla_fwd.py b/examples/dsa_sparse_finetune/sparse_mla_fwd.py new file mode 100644 index 000000000..d87523695 --- /dev/null +++ b/examples/dsa_sparse_finetune/sparse_mla_fwd.py @@ -0,0 +1,310 @@ +# ruff: noqa +import torch +import tilelang +from tilelang import language as T +from index import prepare_token_indices + +from utils import assert_tensors_similar + + +@tilelang.jit( + out_idx=[-2, -1], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }, +) +def sparse_mla_fwd( + heads, + dim, + tail_dim, + topk, + kv_group=1, + sm_scale=None, + is_causal=True, + CP0=True, + block_I=32, + num_stages=2, + threads=128, +): + assert dim == tilelang.math.next_power_of_2(dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2(tail_dim), f"haven't check padding 
correctness yet, dim={tail_dim}" + assert is_causal == True, "non-casual is not supported" + assert topk % block_I == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" + if sm_scale is None: + sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 + else: + sm_scale = sm_scale + + batch_plus_one = T.symbolic("batch_plus_one") + seq_len = T.symbolic("seq_len") + + head_kv = heads // kv_group + q_shape = [seq_len, heads, dim + tail_dim] + kv_shape = [seq_len, kv_group, dim + tail_dim] + o_shape = [seq_len, heads, dim] + indices_shape = [seq_len, kv_group, topk] + lse_shape = [seq_len, heads] + offsets_shape = [batch_plus_one] + token_indices_shape = [seq_len, 2] + indices_dtype = T.int32 + dtype = T.bfloat16 + accum_dtype = T.float32 + + G = kv_group + H = head_kv + padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) + if padded_H != H: + assert kv_group == 1, ( + "here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + ) + BI = block_I + NI = tilelang.cdiv(topk, block_I) + D = dim + D_tail = tail_dim + + if head_kv > 64: + assert head_kv % 64 == 0, "head_kv should be a multiple of 64" + REPLICATE_H = head_kv // 64 + else: + REPLICATE_H = 1 + + H_per_block = padded_H if REPLICATE_H == 1 else 64 + + @T.prim_func + def main( + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + Offsets: T.Tensor(offsets_shape, indices_dtype), # type: ignore + TokenIndices: T.Tensor(token_indices_shape, indices_dtype), # type: ignore + Output: T.Tensor(o_shape, dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + ): + with T.Kernel(seq_len * REPLICATE_H, kv_group, threads=threads) as ( + bx, + by, + ): + Q_shared = T.alloc_shared([H_per_block, D], dtype) + Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) + KV_shared = T.alloc_shared([BI, D], dtype) + K_tail_shared = T.alloc_shared([BI, D_tail], dtype) + mask = T.alloc_fragment([BI], "bool") + + acc_o = T.alloc_fragment([H_per_block, D], accum_dtype) + acc_s = T.alloc_fragment([H_per_block, BI], accum_dtype) + S_shared = T.alloc_shared([H_per_block, BI], dtype) + sumexp = T.alloc_fragment([H_per_block], accum_dtype) + sumexp_i = T.alloc_fragment([H_per_block], accum_dtype) + alpha = T.alloc_fragment([H_per_block], accum_dtype) + m_i = T.alloc_fragment([H_per_block], accum_dtype) + m_i_prev = T.alloc_fragment([H_per_block], accum_dtype) + + T.fill(acc_o, 0) + T.fill(sumexp, 0) + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan + + b_s_i = bx if REPLICATE_H == 1 else (bx // REPLICATE_H) + b_i, s_i = TokenIndices[b_s_i, 0], TokenIndices[b_s_i, 1] + bos, eos = Offsets[b_i], Offsets[b_i + 1] + g_i = by + q_i = s_i + max_kv_i = q_i + + H0 = g_i * padded_H + (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * 64) + H1 = H0 + H_per_block + + T.copy(Q[bos + s_i, H0:H1, :D], Q_shared) + T.copy(Q[bos + s_i, H0:H1, D:], Q_tail_shared) + + for i_i in T.Pipelined(NI, num_stages=num_stages): + for bi_i in T.Parallel(BI): + mask[bi_i] = (Indices[bos + s_i, g_i, i_i * BI + bi_i] <= max_kv_i) & (Indices[bos + s_i, g_i, i_i * BI + bi_i] != -1) + + for bi_i, d_i in T.Parallel(BI, D): + KV_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, i_i * BI + bi_i], g_i, d_i] + for bi_i, d_i in T.Parallel(BI, D_tail): + K_tail_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, 
i_i * BI + bi_i], g_i, D + d_i] + + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_s.dtype)) + T.gemm( + Q_shared, + KV_shared, + acc_s, + transpose_B=True, + policy=T.GemmWarpPolicy.FullRow, + ) + T.gemm( + Q_tail_shared, + K_tail_shared, + acc_s, + transpose_B=True, + policy=T.GemmWarpPolicy.FullRow, + ) + T.copy(m_i, m_i_prev) + T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + alpha[h_i] = T.exp((m_i_prev[h_i] - m_i[h_i]) * sm_scale) + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.exp(acc_s[h_i, bi_i] * sm_scale - m_i[h_i] * sm_scale) + T.reduce_sum(acc_s, sumexp_i, dim=1) # is this a accumulate operator? + for h_i in T.Parallel(H_per_block): + sumexp[h_i] = sumexp[h_i] * alpha[h_i] + sumexp_i[h_i] + for h_i, d_i in T.Parallel(H_per_block, D): + acc_o[h_i, d_i] = acc_o[h_i, d_i] * alpha[h_i] + + T.copy(acc_s, S_shared) + T.gemm(S_shared, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + + # Rescale + for h_i, d_i in T.Parallel(H_per_block, D): + acc_o[h_i, d_i] /= sumexp[h_i] + for h_i in T.Parallel(H_per_block): + sumexp[h_i] = T.log(sumexp[h_i]) + m_i[h_i] * sm_scale + + T.copy(acc_o, Output[bos + s_i, H0:H1, :]) + T.copy(sumexp, Lse[bos + s_i, H0:H1]) + + return main + + +def sparse_mla_fwd_interface( + q, kv, indices, offsets, sm_scale=None, return_p_sum: bool = False, d_v=512, block_I=32, num_stages=2, threads=128 +): + is_casual = True + assert return_p_sum == False, "This kernel file is for fwd only" + assert q.is_contiguous() and kv.is_contiguous() and indices.is_contiguous() + seq_len, heads, dim_plus_tail_dim = q.shape + seq_len_kv, kv_group, _ = kv.shape + assert seq_len == seq_len_kv + + assert dim_plus_tail_dim == 576, "you should assign dim otherwise" + dim = d_v + + assert kv.shape[-1] == dim_plus_tail_dim + tail_dim = dim_plus_tail_dim - dim + _, _, topk = indices.shape + assert indices.shape == (seq_len, kv_group, topk) + + token_indices = prepare_token_indices(offsets) + + kernel = sparse_mla_fwd( + heads, dim, tail_dim, topk, kv_group, sm_scale, is_casual, block_I=block_I, num_stages=num_stages, threads=threads + ) + out, lse = kernel(q, kv, indices, offsets, token_indices) + return out, lse + + +def ref_sparse_mla_fwd_interface(Q, KV, Indices, offsets, sm_scale=None, is_casual=True): + Q = Q.float() + KV = KV.float() + all_o = [] + for i in range(offsets.shape[0] - 1): + q = Q[None, offsets[i] : offsets[i + 1]] + kv = KV[None, offsets[i] : offsets[i + 1]] + indices = Indices[None, offsets[i] : offsets[i + 1]].clone() + + indices = indices.transpose(1, 2) + b, sq, h, dim_q = q.shape + b, sk, g, _ = kv.shape + + assert kv.shape[-1] == 576, "you should assign dim otherwise" + dim = 512 + k = kv + v = kv[..., :dim] + + b, _, _, dim_v = v.shape + g_index = g + h_index = h // g + compressed_casual_mask = torch.arange(0, sq, dtype=torch.int32, device="cuda").view(-1, 1) >= torch.arange( + 1 - 1, sk * 1, 1, dtype=torch.int32, device="cuda" + ).view(1, -1) + + indices[indices > sk] = sk + mask = q.new_zeros(b, g_index, sq, sk + 1, dtype=torch.bool).scatter(3, indices.long(), 1) + mask = mask[..., :-1] + mask = mask & compressed_casual_mask.view(1, 1, sq, sk) + mask[:, :, : 1 - 1, 0] = True + mask = mask.view(b, g_index, 1, sq, sk) + + q = q.view(b, sq, g, -1, dim_q) + score = torch.einsum("bmghd,bngd->bghmn", q, k) + sm_scale = dim_q**-0.5 if sm_scale is None else sm_scale + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) + p = 
score.softmax(dim=-1) + p = p.view(b, g_index, h_index, -1, sq, sk) + p = p.view(b, g, -1, sq, sk) + o = torch.einsum("bghmn,bngd->bmghd", p.type(v.dtype), v) + o = o.reshape(b, sq, h, dim_v) + all_o.append(o.squeeze(0)) + o = torch.cat(all_o, dim=0) + return o.to(torch.bfloat16) + + +def test_sparse_mla_fwd( + B=1, + S=4096, + H=128, + HKV=1, + DQK=576, + DV=512, + topk=2048, + dtype=torch.bfloat16, + check_correctness=True, + block_I=64, + num_stages=2, + threads=256, +): + torch.random.manual_seed(0) + q = torch.randn((S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) + kv = torch.randn((S, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True) + offsets = torch.tensor([0, S // 2 - 1, S], dtype=torch.int32, device="cuda") + + indices = torch.full((S, HKV, topk), S, dtype=torch.int32, device="cuda") + for i in range(offsets.shape[0] - 1): + seq_len = (offsets[i + 1] - offsets[i]).item() + assert seq_len >= topk + for t in range(seq_len): + for h in range(HKV): + i_i = torch.randperm(max(1, t))[:topk] + indices[offsets[i] + t, h, : len(i_i)] = i_i + + tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices, offsets, block_I=block_I, num_stages=num_stages, threads=threads) + + if check_correctness: + # otherwise may cause out of memory + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, offsets) + assert_tensors_similar(tl_out, ref_out, eps=1e-2, name="out") + print("assert_tensors_similar passed") + + def fn(): + return sparse_mla_fwd_interface(q, kv, indices, offsets, block_I=block_I, num_stages=num_stages, threads=threads) + + from tilelang.profiler import do_bench + + ms = do_bench( + fn, + rep=100, + warmup=250, + ) + print(f"Average time: {ms:.3f} ms") + print("fwd io bandwidth = ", (B * S * DQK * topk * 2) / (ms * 1e-3) / 1e12) + print("fwd tflops = ", (B * S * (DQK + DV) * topk * 2 * H) / (ms * 1e-3) / 1e12) + + +if __name__ == "__main__": + test_sparse_mla_fwd( + B=1, + S=4096, + H=128, + HKV=1, + DQK=576, + DV=512, + topk=1024, + dtype=torch.bfloat16, + check_correctness=True, + block_I=64, + num_stages=2, + threads=256, + ) diff --git a/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py b/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py new file mode 100644 index 000000000..a03bc74f5 --- /dev/null +++ b/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py @@ -0,0 +1,226 @@ +# ruff: noqa +import torch +import torch.nn as nn +import torch.nn.functional as F +import tilelang +from tilelang import language as T +from einops import repeat, rearrange, einsum +from index import prepare_token_indices +from utils import get_abs_err, get_err_ratio + +BF16 = T.bfloat16 +FP32 = T.float32 +INT32 = T.int32 + +pass_configs = { + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, +} + + +@tilelang.jit(pass_configs=pass_configs) +def tl_sparse_mla_topk_reducesum_impl( + heads, + dim, + tail_dim, + topk, + kv_group=1, + sm_scale=None, + block_I=32, + num_stages=2, + threads=128, +): + assert dim == tilelang.math.next_power_of_2(dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2(tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" + assert topk % block_I == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" + if sm_scale is None: + sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 + + batch_plus_one = T.symbolic("batch_plus_one") + seq_len = T.symbolic("seq_len") + seq_len_kv = T.symbolic("seq_len_kv") + + 
head_kv = heads // kv_group + indices_dtype = T.int32 + dtype = T.bfloat16 + accum_dtype = T.float32 + + G = kv_group + H = head_kv + padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) + if padded_H != H: + assert kv_group == 1, ( + "here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + ) + BI = block_I + NI = tilelang.cdiv(topk, block_I) + D = dim + D_tail = tail_dim + + if head_kv > 64: + assert head_kv % 64 == 0, "head_kv should be a multiple of 64" + REPLICATE_H = head_kv // 64 + else: + REPLICATE_H = 1 + + H_per_block = padded_H if REPLICATE_H == 1 else 64 + + q_shape = [seq_len, heads, dim + tail_dim] + kv_shape = [seq_len_kv, kv_group, dim + tail_dim] + indices_shape = [seq_len, kv_group, topk] + lse_shape = [seq_len, heads] + reducesum_shape = [seq_len, kv_group, REPLICATE_H, topk] + offsets_shape = [batch_plus_one] + token_indices_shape = [seq_len, 2] + + @T.prim_func + def tl_sparse_mla_topk_reducesum_kernel( + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + Offsets: T.Tensor(offsets_shape, indices_dtype), # type: ignore + TokenIndices: T.Tensor(token_indices_shape, indices_dtype), # type: ignore + ReduceSum: T.Tensor(reducesum_shape, accum_dtype), # type: ignore + ): + with T.Kernel(seq_len * REPLICATE_H, kv_group, threads=threads) as ( + bx, + by, + ): + Q_shared = T.alloc_shared([H_per_block, D], dtype) + Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) + KV_shared = T.alloc_shared([BI, D], dtype) + K_tail_shared = T.alloc_shared([BI, D_tail], dtype) + mask = T.alloc_fragment([BI], "bool") + + acc_s = T.alloc_fragment([H_per_block, BI], accum_dtype) + reducesum = T.alloc_fragment([BI], accum_dtype) + lse = T.alloc_fragment([H_per_block], accum_dtype) + + T.fill(lse, 0) + + b_s_i = bx if REPLICATE_H == 1 else (bx // REPLICATE_H) + b_i, s_i = TokenIndices[b_s_i, 0], TokenIndices[b_s_i, 1] + bos, eos = Offsets[b_i], Offsets[b_i + 1] + r_i = bx % REPLICATE_H + g_i = by + q_i = s_i + max_kv_i = q_i + + H0 = g_i * padded_H + (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * 64) + H1 = H0 + H_per_block + + T.copy(Q[bos + s_i, H0:H1, :D], Q_shared) + T.copy(Q[bos + s_i, H0:H1, D:], Q_tail_shared) + T.copy(Lse[bos + s_i, H0:H1], lse) + + for i_i in T.Pipelined(NI, num_stages=num_stages): + for bi_i in T.Parallel(BI): + mask[bi_i] = (Indices[bos + s_i, g_i, i_i * BI + bi_i] <= max_kv_i) & (Indices[bos + s_i, g_i, i_i * BI + bi_i] != -1) + + for bi_i, d_i in T.Parallel(BI, D): + KV_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, i_i * BI + bi_i], g_i, d_i] + for bi_i, d_i in T.Parallel(BI, D_tail): + K_tail_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, i_i * BI + bi_i], g_i, D + d_i] + + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_s.dtype)) + T.gemm( + Q_shared, + KV_shared, + acc_s, + transpose_B=True, + policy=T.GemmWarpPolicy.FullRow, + ) + T.gemm( + Q_tail_shared, + K_tail_shared, + acc_s, + transpose_B=True, + policy=T.GemmWarpPolicy.FullRow, + ) + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.exp(acc_s[h_i, bi_i] * sm_scale - lse[h_i]) + T.reduce_sum(acc_s, reducesum, dim=0) + T.copy(reducesum, ReduceSum[bos + s_i, g_i, r_i, i_i * BI : i_i * BI + 
BI]) + + return tl_sparse_mla_topk_reducesum_kernel + + +def sparse_mla_topk_reducesum_interface( + q: torch.Tensor, + kv: torch.Tensor, + topk_indices: torch.Tensor, + lse: torch.Tensor, + offsets: torch.Tensor, + dim_v: int, +): + assert kv.shape[-2] == 1 + seq_len, heads, dim_plus_tail_dim, topk = *q.shape, topk_indices.shape[-1] + REPLICATE_H = max(heads // 64, 1) + tail_dim = dim_plus_tail_dim - dim_v + token_indices = prepare_token_indices(offsets) + + reducesum = torch.zeros([seq_len, 1, REPLICATE_H, topk], dtype=torch.float32, device=q.device) + kernel = tl_sparse_mla_topk_reducesum_impl(heads=heads, dim=dim_v, tail_dim=tail_dim, topk=topk) + kernel(q, kv, topk_indices, lse, offsets, token_indices, reducesum) + reducesum = reducesum.sum(dim=-2) # [batch, seq_len, 1, RH, topk] -> [batch, seq_len, 1, topk] + attn_score = reducesum / reducesum.sum(dim=-1, keepdim=True) + + return attn_score + + +def ref_mla_topk_softmax(Q: torch.Tensor, K: torch.Tensor, TopkIndices: torch.Tensor, offsets: torch.Tensor): + # q: [batch, seq_len, heads, dim] + # k: [batch, seq_len, dim] + sm_scale = Q.shape[-1] ** -0.5 + all_lse = [] + all_topk_score = [] + for i in range(offsets.shape[0] - 1): + q = Q[offsets[i] : offsets[i + 1]] + k = K[offsets[i] : offsets[i + 1]] + topk_indices = TopkIndices[offsets[i] : offsets[i + 1]] + seq_len = q.shape[0] + mask = (torch.arange(seq_len)[:, None] >= torch.arange(seq_len)[None, :]).unsqueeze(-2).cuda() + logits = einsum(q, k, "s1 h d, s2 d -> s1 h s2") * sm_scale + logits = torch.where(mask, logits, float("-inf")) + score = F.softmax(logits, dim=-1, dtype=torch.float32) + score_sum = score.sum(dim=-2) + topk_score = torch.gather(score_sum, dim=-1, index=topk_indices.to(torch.int64)) + topk_score = topk_score / topk_score.sum(dim=-1, keepdim=True) + max_logits = logits.amax(dim=-1).to(torch.float32) + lse = torch.log((logits - max_logits.unsqueeze(-1).to(torch.float32)).exp().sum(dim=-1)) + max_logits + all_lse.append(lse) + all_topk_score.append(topk_score) + lse = torch.cat(all_lse, dim=0) + topk_score = torch.cat(all_topk_score, dim=0) + return lse, topk_score + + +def test_kernel( + B=1, + S=2048, + H=16, + D=512, + tail_D=64, + topk=128, +): + torch.manual_seed(42) + + q = torch.randn((S, H, D + tail_D)).cuda().bfloat16() + kv = torch.randn((S, D + tail_D)).cuda().bfloat16() + offsets = torch.tensor([0, 1023, S], dtype=torch.int32).cuda() + + topk_indices = repeat(torch.arange(topk, dtype=torch.int32).cuda(), "k -> s k", s=S).contiguous() + + lse, ref_attn_score = ref_mla_topk_softmax(q, kv, topk_indices, offsets) + + kv = kv.unsqueeze(-2) + topk_indices = topk_indices.unsqueeze(-2) + + attn_score = sparse_mla_topk_reducesum_interface(q, kv, topk_indices, lse, offsets, dim_v=D).squeeze(-2) + print(f"attn_score err: {get_abs_err(attn_score, ref_attn_score):.6f} ratio: {get_err_ratio(attn_score, ref_attn_score):.6f}") + + +if __name__ == "__main__": + test_kernel() diff --git a/examples/dsa_sparse_finetune/utils.py b/examples/dsa_sparse_finetune/utils.py new file mode 100644 index 000000000..96afd064d --- /dev/null +++ b/examples/dsa_sparse_finetune/utils.py @@ -0,0 +1,73 @@ +import torch + + +def get_abs_err(y, x): + x = x.to(torch.float32) + y = y.to(torch.float32) + return (x - y).flatten().abs().max().item() + + +def get_err_ratio(y, x): + x = x.to(torch.float32) + y = y.to(torch.float32) + err = (x - y).flatten().square().mean().sqrt().item() + base = (x).flatten().square().mean().sqrt().item() + return err / base + + +def calculate_tensor_similarity(x, y, 
name="tensor"): + """ + Calculate similarity between two tensors using a normalized dot product metric. + + Unlike torch.testing.assert_close which uses absolute/relative tolerance based on + element-wise differences, this function computes a global similarity score: + sim = 2 * / (||x||^2 + ||y||^2) + + This metric is scale-invariant and measures the cosine-like similarity normalized + by the magnitude of both tensors. It returns 1 for identical tensors and values + closer to 0 for dissimilar ones. This is particularly useful for comparing tensors + with varying magnitudes where relative errors matter more than absolute differences. + + Args: + x: First tensor to compare + y: Second tensor to compare + name: Name of the tensor for logging purposes + + Returns: + Similarity score in range [0, 1] where 1 means identical + """ + x, y = x.data.double(), y.data.double() + denominator = (x * x + y * y).sum() + if denominator == 0: + print(f"\033[33mWARNING: {name} all zero\033[0m") + return 1 + sim = 2 * (x * y).sum() / denominator + return sim + + +def assert_tensors_similar(x, y, eps=1e-8, name="tensor", raise_assert=True): + """ + Assert that two tensors are similar using a global similarity metric. + + Key differences from torch.testing.assert_close: + - torch.testing.assert_close: Uses element-wise comparison with rtol/atol, checking + that |x - y| <= atol + rtol * |y| for each element. It's sensitive to outliers + and requires all elements to satisfy the tolerance. + - assert_tensors_similar: Uses a single global similarity score (1 - sim) where sim is the + normalized dot product. It's more robust to outliers and focuses on overall + tensor similarity rather than element-wise precision. This is better suited for + comparing large tensors where a few outlier elements shouldn't fail the test. 
+ + Args: + x: First tensor to compare + y: Second tensor to compare + eps: Maximum allowed difference (1 - similarity), default 1e-8 + name: Name of the tensor for error messages + raise_assert: Whether to raise assertion error on failure + """ + sim = calculate_tensor_similarity(x, y, name) + diff = 1.0 - sim + if not (0 <= diff <= eps): + print(f"\033[31mERROR: {name} similarity check failed, diff={diff:.2e} (threshold={eps:.2e})\033[0m") + if raise_assert: + assert False # noqa: B011 diff --git a/examples/dynamic_shape/example_dynamic.py b/examples/dynamic_shape/example_dynamic.py index be018c8b7..e338d76ca 100644 --- a/examples/dynamic_shape/example_dynamic.py +++ b/examples/dynamic_shape/example_dynamic.py @@ -1,10 +1,9 @@ import tilelang import tilelang.language as T import tilelang.testing -from tilelang import tvm as tvm -@tilelang.jit(pass_configs={"tl.disable_dynamic_tail_split": True, "tl.dynamic_alignment": 8}) +@tilelang.jit def matmul_dynamic_mnk( block_M, block_N, @@ -17,9 +16,9 @@ def matmul_dynamic_mnk( num_stages, threads, ): - M = tvm.te.var("m") - N = tvm.te.var("n") - K = tvm.te.var("k") + M = T.dynamic("m") + N = T.dynamic("n") + K = T.dynamic("k") A_shape = (K, M) if trans_A else (M, K) B_shape = (N, K) if trans_B else (K, N) @@ -29,9 +28,9 @@ def matmul_dynamic_mnk( @T.prim_func def dynamic_matmul( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -53,15 +52,14 @@ def dynamic_matmul( return dynamic_matmul -def matmul_dynamic(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, - accum_dtype, num_stages, threads): +def matmul_dynamic(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, accum_dtype, num_stages, threads): print( f"M: {M}, N: {N}, K: {K}, block_M: {block_M}, block_N: {block_N}, block_K: {block_K}, trans_A: {trans_A}, trans_B: {trans_B}, in_dtype: {in_dtype}, out_dtype: {out_dtype}, accum_dtype: {accum_dtype}, num_stages: {num_stages}, threads: {threads}" ) - kernel = matmul_dynamic_mnk(block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, - accum_dtype, num_stages, threads) + kernel = matmul_dynamic_mnk(block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, accum_dtype, num_stages, threads) import torch + if trans_A: A = torch.rand(K, M, device="cuda", dtype=getattr(torch, in_dtype)) else: @@ -103,8 +101,30 @@ def main(M=16384, N=16384, K=16384): accum_dtype = "float32" num_stages = 3 threads = 128 - matmul_dynamic(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, - accum_dtype, num_stages, threads) + matmul_dynamic(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, accum_dtype, num_stages, threads) + + +def run_regression_perf(M=4096, N=4096, K=4096): + block_M, block_N, block_K = 128, 128, 32 + trans_A, trans_B = False, False + in_dtype, out_dtype = "float16", "float16" + accum_dtype = "float32" + num_stages = 3 + threads = 128 + kernel = matmul_dynamic_mnk(block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, accum_dtype, num_stages, threads) + import torch + + if trans_A: + A = torch.rand(K, M, device="cuda", dtype=getattr(torch, in_dtype)) + else: + A = torch.rand(M, K, device="cuda", dtype=getattr(torch, in_dtype)) + if 
trans_B: + B = torch.rand(N, K, device="cuda", dtype=getattr(torch, in_dtype)) + else: + B = torch.rand(K, N, device="cuda", dtype=getattr(torch, in_dtype)) + C = torch.zeros(M, N, device="cuda", dtype=getattr(torch, out_dtype)) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + return profiler.do_bench(input_tensors=[A, B, C], backend="cupti") if __name__ == "__main__": diff --git a/examples/dynamic_shape/regression_example_dynamic.py b/examples/dynamic_shape/regression_example_dynamic.py new file mode 100644 index 000000000..958695990 --- /dev/null +++ b/examples/dynamic_shape/regression_example_dynamic.py @@ -0,0 +1,10 @@ +import tilelang.testing +import example_dynamic + + +def regression_example_dynamic(): + tilelang.testing.process_func(example_dynamic.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/dynamic_shape/test_example_dynamic.py b/examples/dynamic_shape/test_example_dynamic.py deleted file mode 100644 index 36a3743f1..000000000 --- a/examples/dynamic_shape/test_example_dynamic.py +++ /dev/null @@ -1,10 +0,0 @@ -import tilelang.testing -import example_dynamic - - -def test_example_dynamic(): - example_dynamic.main(M=1024, N=1024, K=1024) - - -if __name__ == "__main__": - tilelang.testing.main() diff --git a/examples/eager_jit/eagerjit.en.ipynb b/examples/eager_jit/eagerjit.en.ipynb new file mode 100644 index 000000000..6a2bf8453 --- /dev/null +++ b/examples/eager_jit/eagerjit.en.ipynb @@ -0,0 +1,977 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5e0deecc", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "sys.path.insert(0, str(Path.cwd().parent.parent.absolute()))\n", + "import tilelang\n", + "import torch\n", + "import tilelang.language as T" + ] + }, + { + "cell_type": "markdown", + "id": "1ca2c56d", + "metadata": {}, + "source": [ + "# Tilelang Eager JIT" + ] + }, + { + "cell_type": "markdown", + "id": "156e7370", + "metadata": {}, + "source": [ + "## Tensor Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "b070c109", + "metadata": {}, + "source": [ + "Tilelang Eager JIT merges JIT kernel generation and invocation into a single workflow.\n", + "\n", + "The function signature looks similar to Triton, but we add many enhancements; the most important one is allowing rich Tensor annotations:\n", + "\n", + "* If a Tensor has complex shape constraints, we can move its annotation into the function body.\n", + "* Use `T.const` or `T.dynamic` to create shape variables, then annotate complex Tensors with `T.Tensor`.\n", + "* Use `T.empty` to declare return tensors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "60bf8954", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def gemm(\n", + " A,\n", + " B,\n", + " out_dtype: T.dtype = T.float32,\n", + " block_M: int = 128,\n", + " block_N: int = 128,\n", + " block_K: int = 32,\n", + "):\n", + " M, N, K = T.const(\"M, N, K\")\n", + "\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + "\n", + " C = T.empty((M, N), out_dtype)\n", + "\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n", + " B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n", + " C_local = T.alloc_fragment((block_M, block_N), out_dtype)\n", + " T.clear(C_local)\n", + " for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n", + " T.copy(A[bx * block_M, k * block_K], A_shared)\n", + " T.copy(B[k * block_K, by * block_N], B_shared)\n", + " T.gemm(A_shared, B_shared, C_local)\n", + " T.copy(C_local, C[bx * block_M, by * block_N])\n", + " return C" + ] + }, + { + "cell_type": "markdown", + "id": "28f868fe", + "metadata": {}, + "source": [ + "Calling the function with Tensors directly triggers the full JIT compile-and-run pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ee13394a", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm(A, B)\n", + "\n", + "# check output is correct\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "c6705091", + "metadata": {}, + "source": [ + "Changing the call arguments may trigger a recompilation when compilation parameters change:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d8aab5b7", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 1024, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm(A, B, block_M=64, block_N=64)" + ] + }, + { + "cell_type": "markdown", + "id": "ce6b7391", + "metadata": {}, + "source": [ + "You can also explicitly call the `compile` method to build the kernel.\n", + "\n", + "1. `ker.compile` compiles the kernel\n", + "2. `ker.get_tir` retrieves the TIR\n", + "3. `ker.par_compile` compiles in parallel" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f3cf3a2d", + "metadata": {}, + "outputs": [], + "source": [ + "kernel = gemm.compile(A, B, block_M=64, block_N=64)\n", + "C = kernel(A, B)" + ] + }, + { + "cell_type": "markdown", + "id": "921761b5", + "metadata": {}, + "source": [ + "## More Tensor Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "4539e54e", + "metadata": {}, + "source": [ + "### Use macros to separate implementation" + ] + }, + { + "cell_type": "markdown", + "id": "ad96ba65", + "metadata": {}, + "source": [ + "Next, we implement a simple GEMM in several different ways. 
For convenience, we first write a macro that contains the core GEMM logic:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "171d4fe6", + "metadata": {}, + "outputs": [], + "source": [ + "@T.macro\n", + "def gemm_impl(A, B, C, M, N, K, block_M, block_N, block_K):\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n", + " B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n", + " C_local = T.alloc_fragment((block_M, block_N), C.dtype)\n", + " T.clear(C_local)\n", + " for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n", + " T.copy(A[bx * block_M, k * block_K], A_shared)\n", + " T.copy(B[k * block_K, by * block_N], B_shared)\n", + " T.gemm(A_shared, B_shared, C_local)\n", + " T.copy(C_local, C[bx * block_M, by * block_N])" + ] + }, + { + "cell_type": "markdown", + "id": "446a1acd", + "metadata": {}, + "source": [ + "### Use `T.dynamic` to mark dynamic shapes\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6a38aa95", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def gemm_dyn_K(A, B):\n", + " M, N, K = T.dynamic(\"M, N, K\")\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, 128, 128, 32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fe6cfdc8", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_dyn_K(A, B)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "2ee97bf7", + "metadata": {}, + "source": [ + "### Use `T.StridedTensor` to annotate tensors with strides\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9dde1dae", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def as_contingious(A):\n", + " M, N, dM, dN = T.dynamic(\"M, N, dM, dN\")\n", + " A: T.StridedTensor[[M, N], [dM, dN], T.float32]\n", + " B = T.empty((M, N), A.dtype)\n", + " block_M = 128\n", + " block_N = 128\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " T.copy(\n", + " A[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n", + " B[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n", + " )\n", + " return B" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "dec2c0a7", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 1024, device=\"cuda\")\n", + "B = as_contingious(A.T)\n", + "B_ref = A.T.contiguous()\n", + "torch.testing.assert_close(B, B_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "f5fb20d6", + "metadata": {}, + "source": [ + "## More Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "890df0a2", + "metadata": {}, + "source": [ + "### Use parameters directly as annotations" + ] + }, + { + "cell_type": "markdown", + "id": "e9a47d42", + "metadata": {}, + "source": [ + "You can directly use function parameters in the annotations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0fc17af6", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def gemm_ptr(\n", + " A,\n", + " B,\n", + " M,\n", + " N,\n", + " K,\n", + "):\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8e52a554", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_ptr(A, B, 1024, 256, 512)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "6b19ef90", + "metadata": {}, + "source": [ + "### Annotations for runtime variables" + ] + }, + { + "cell_type": "markdown", + "id": "bba5f27f", + "metadata": {}, + "source": [ + "Runtime variables work the same; if the function annotation becomes too long, you can move it into the function body." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c1e7598a", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def gemm_ptr_dyn(A, B, M, N, K):\n", + " M: T.int32\n", + " N: T.int32\n", + " K: T.int32\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9e9a4c88", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_ptr_dyn(A, B, 1024, 256, 512)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "81427765", + "metadata": {}, + "source": [ + "### Constraints for constants" + ] + }, + { + "cell_type": "markdown", + "id": "4d6b084b", + "metadata": {}, + "source": [ + "A constant annotation created by `T.const` must be used directly at least once, otherwise an error is raised." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c90dd24f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constexpr variable `M` is not used in any buffer shape or stride.\n", + "At least one **DIRECT** usage is required. Please check:\n", + "(1) the variable is not used\n", + "(2) all uses are indirect, e.g. M * 2, M * 3. 
(you can replace them with separate constexpr variables)\n", + "Buffer shapes: {A: [M * 2, M * 3]}\n", + "Buffer strides: {A: [M * 3, 1]}\n" + ] + } + ], + "source": [ + "@tilelang.jit\n", + "def example_wrong_kernel(A):\n", + " M = T.const(\"M\")\n", + " A: T.Tensor[[M * 2, M * 3], T.float32]\n", + " with T.Kernel(1) as _:\n", + " A[0, 0]\n", + "\n", + "\n", + "try:\n", + " A = torch.randn(64, 96, dtype=torch.float32, device=\"cuda\")\n", + " example_wrong_kernel(A)\n", + "except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "id": "e07e762b", + "metadata": {}, + "source": [ + "### Dynamic dimensions" + ] + }, + { + "cell_type": "markdown", + "id": "f48e5d7a", + "metadata": {}, + "source": [ + "If you want certain parameters in a Tensor annotation to change, it is recommended to switch to the `T.ptr` + `T.match_buffer` style." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "1d050321", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@tilelang.jit\n", + "def dyn_annot(\n", + " A: T.ptr, # 1. T.ptr type annotation\n", + " is_2d=False,\n", + "):\n", + " if is_2d:\n", + " M, N = T.const(\"M, N\")\n", + " # 2. dynamic shape annotation inside function body\n", + " A = T.match_buffer(A, [M, N], T.float32)\n", + " with T.Kernel(1) as _:\n", + " A[0, 0]\n", + " else:\n", + " L = T.const(\"L\")\n", + " A = T.match_buffer(A, [L], T.float32)\n", + " with T.Kernel(1) as _:\n", + " A[0]\n", + "\n", + "\n", + "A = torch.randn(64, 96, dtype=torch.float32, device=\"cuda\")\n", + "dyn_annot(A, is_2d=True)" + ] + }, + { + "cell_type": "markdown", + "id": "2e9f1bb3", + "metadata": {}, + "source": [ + "### Default arguments" + ] + }, + { + "cell_type": "markdown", + "id": "f7fc9917", + "metadata": {}, + "source": [ + "Scalar annotations like `T.float32` can carry default values." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "42ec86a1", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def add_one(X, data: T.float32 = 1):\n", + " M, N = T.const(\"M, N\")\n", + " X: T.Tensor[[M, N], T.float32]\n", + " Y = T.empty((M, N), T.float32)\n", + " with T.Kernel(T.ceildiv(M, 128), threads=128) as bx:\n", + " for i, j in T.Parallel(128, N):\n", + " Y[bx * 128 + i, j] = X[bx * 128 + i, j] + data\n", + " return Y" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d49e1120", + "metadata": {}, + "outputs": [], + "source": [ + "X = torch.randn(1024, 1024, dtype=torch.float32, device=\"cuda\")\n", + "Y = add_one(X)\n", + "torch.testing.assert_close(Y, X + 1)" + ] + }, + { + "cell_type": "markdown", + "id": "a02baedc", + "metadata": {}, + "source": [ + "## Overhead of argument matching" + ] + }, + { + "cell_type": "markdown", + "id": "860a2972", + "metadata": {}, + "source": [ + "EagerJIT has very small overhead; each additional constant annotation costs about 200 ns.\n", + "* 200 ns is roughly the cost of an FFI call that reads parameters from a `torch.Tensor`'s shape/stride." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc676e33", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Kernel call : 7.68 us\n", + "Parse cache key: 0.41 us\n" + ] + } + ], + "source": [ + "import time\n", + "\n", + "A = torch.randn(128, 128, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(128, 128, dtype=torch.float16, device=\"cuda\")\n", + "\n", + "\n", + "@tilelang.jit\n", + "def dummy_kernel(A, B):\n", + " M, N = T.const(\"M, N\")\n", + " A: T.Tensor[[M, N], T.float16]\n", + " B: T.Tensor[[M, N], T.float16]\n", + " with T.Kernel(1) as _:\n", + " pass\n", + "\n", + "\n", + "# compile it first\n", + "dummy_kernel(A, B)\n", + "\n", + "\n", + "def eval_overhead(f):\n", + " start = time.perf_counter_ns()\n", + " for _ in range(10000):\n", + " f()\n", + " stop = time.perf_counter_ns()\n", + " return (stop - start) / 10000 / 1000\n", + "\n", + "\n", + "kernel_call_overhead = eval_overhead(lambda: dummy_kernel(A, B))\n", + "parse_cache_key_overhead = eval_overhead(lambda: dummy_kernel.parse_cache_key(A, B))\n", + "\n", + "print(f\"Kernel call : {kernel_call_overhead:.2f} us\")\n", + "print(f\"Parse cache key: {parse_cache_key_overhead:.2f} us\")" + ] + }, + { + "cell_type": "markdown", + "id": "39166cb4", + "metadata": {}, + "source": [ + "## Compilation and parallel compilation" + ] + }, + { + "cell_type": "markdown", + "id": "8c6fbe08", + "metadata": {}, + "source": [ + "Both EagerJIT and the original `jit` (i.e. LazyJIT) support parallel compilation.\n", + "\n", + "To avoid wasting memory on temporary `torch.Tensor` objects, you can use `T.Tensor` to create placeholders." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7222e57b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8a4e4eb3cd4445bda6e8693da31ef3b8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Elaborating: 0%| | 0/8 [00:00,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from itertools import product\n", + "\n", + "\n", + "def get_configs():\n", + " return [\n", + " {\n", + " \"A\": T.Tensor((1024, 1024), T.float32),\n", + " \"B\": T.Tensor((1024, 1024), T.float32),\n", + " \"block_M\": block_M,\n", + " \"block_N\": block_N,\n", + " \"block_K\": block_K,\n", + " }\n", + " for block_M, block_N, block_K in product([32, 64], repeat=3)\n", + " ]\n", + "\n", + "\n", + "gemm.par_compile(get_configs())" + ] + }, + { + "cell_type": "markdown", + "id": "5160d2cc", + "metadata": {}, + "source": [ + "## More convenient macros" + ] + }, + { + "cell_type": "markdown", + "id": "be44afc4", + "metadata": {}, + "source": [ + "tilelang's macros have been improved:\n", + "\n", + "1. Allow using `T.Ref` as an annotation, similar to C++ references.\n", + "2. Allow returning multiple values.\n", + "3. Allow nesting and recursion." + ] + }, + { + "cell_type": "markdown", + "id": "79575972", + "metadata": {}, + "source": [ + "### Passing references with `T.Ref`\n", + "\n", + "A `T.Ref` reference can point to a scalar variable or to an element of a buffer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "90eaa6e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# from tvm.script import tir as T\n", + "\n", + "@T.prim_func\n", + "def foo(x_handle: T.handle):\n", + " x = T.match_buffer(x_handle, (2,), strides=(1,))\n", + " # with T.block(\"root\"):\n", + " bx = T.launch_thread(\"blockIdx.x\", 1)\n", + " tx = T.launch_thread(\"threadIdx.x\", 128)\n", + " ty = T.launch_thread(\"threadIdx.y\", 1)\n", + " tz = T.launch_thread(\"threadIdx.z\", 1)\n", + " with T.block(\"tilelang_root\"):\n", + " T.reads()\n", + " idx = T.Buffer((1,), \"int32\", scope=\"local.var\")\n", + " T.writes(x[T.min(1, idx[0]):T.min(1, idx[0]) + (T.max(1, idx[0]) + 1 - T.min(1, idx[0]))])\n", + " T.block_attr({\"tl.local_var_init\": {idx.data: 0}})\n", + " idx = T.alloc_buffer((1,), \"int32\", data=idx.data, scope=\"local.var\")\n", + " x[1] = T.float32(1.0)\n", + " _tmp: T.int32 = idx[0]\n", + " x[_tmp] = T.float32(1.0)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@T.macro\n", + "def macro_with_ref(x: T.Ref):\n", + " x = 1 # noqa: F841\n", + "\n", + "\n", + "@T.prim_func\n", + "def foo(x: T.Tensor((2,))):\n", + " with T.Kernel(1) as _:\n", + " # Supports constant indices\n", + " macro_with_ref(x[1])\n", + "\n", + " # Also supports variable indices\n", + " idx = T.alloc_var(T.int32, 0)\n", + " macro_with_ref(x[idx])\n", + "\n", + "\n", + "foo" + ] + }, + { + "cell_type": "markdown", + "id": "7bb447a2", + "metadata": {}, + "source": [ + "### Pass macros as arguments\n", + "\n", + "You can pass a macro as a function argument." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "dc7bb779", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def element_wise(A, fn):\n", + " N = T.dynamic(\"N\")\n", + " A: T.Tensor[[N], T.float32]\n", + " B = T.empty((N,), dtype=A.dtype)\n", + " block_N = 128\n", + " with T.Kernel(T.ceildiv(N, block_N), threads=128) as bx:\n", + " for i in T.Parallel(block_N):\n", + " idx = bx * block_N + i\n", + " B[idx] = fn(A[idx])\n", + " return B\n", + "\n", + "\n", + "@T.macro\n", + "def add_one(x):\n", + " return x + 1" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "a89fdb44", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, device=\"cuda\")\n", + "B = element_wise(A, add_one)\n", + "B_ref = A + 1\n", + "torch.testing.assert_close(B, B_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "ef6e403a", + "metadata": {}, + "source": [ + "### Recursive macros\n", + "\n", + "You may not need this often, but macros can be recursive as long as the termination condition is known at compile time." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "7703cab5", + "metadata": {}, + "outputs": [], + "source": [ + "@T.macro\n", + "def n31(x, var: T.Ref):\n", + " if x == 1:\n", + " pass\n", + " elif x % 2 == 0:\n", + " var = var // 2\n", + " n31(x // 2, var)\n", + " else:\n", + " var = var * 3 + 1\n", + " n31(x * 3 + 1, var)\n", + "\n", + "\n", + "@tilelang.jit\n", + "def foo(A: T.Tensor[[1], T.int32], n: int):\n", + " with T.Kernel(1) as _:\n", + " n31(n, A[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "542ddd4e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([18], device='cuda:0', dtype=torch.int32)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "A = torch.tensor([100], dtype=torch.int32, device=\"cuda\")\n", + "foo(A, 5)\n", + "A" + ] + }, + { + "cell_type": "markdown", + "id": "dc30c2d2", + "metadata": {}, + "source": [ + "### Macros returning multiple values" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d5a2388f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# from tvm.script import tir as T\n", + "\n", + "@T.prim_func\n", + "def foo():\n", + " # with T.block(\"root\"):\n", + " x = T.launch_thread(\"blockIdx.x\", 32)\n", + " tx = T.launch_thread(\"threadIdx.x\", 128)\n", + " ty = T.launch_thread(\"threadIdx.y\", 1)\n", + " tz = T.launch_thread(\"threadIdx.z\", 1)\n", + " with T.block(\"tilelang_root\"):\n", + " T.reads()\n", + " T.writes()\n", + " s: T.int32 = T.sin(x)\n", + " c: T.int32 = T.cos(x)\n", + " a: T.int32 = s + c\n", + " b: T.int32 = s - c\n", + " T.evaluate(0)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@T.macro\n", + "def sincos(x):\n", + " return T.sin(x), T.cos(x)\n", + "\n", + "\n", + "@T.prim_func\n", + "def foo():\n", + " with T.Kernel(32) as x:\n", + " s, c = sincos(x)\n", + " a = s + c # noqa: F841\n", + " b = s - c # noqa: F841\n", + "\n", + "\n", + "foo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd83fea7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tilelang-dev_0", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/eager_jit/eagerjit.zh.ipynb b/examples/eager_jit/eagerjit.zh.ipynb new file mode 100644 index 000000000..0f7c9be99 --- /dev/null +++ b/examples/eager_jit/eagerjit.zh.ipynb @@ -0,0 +1,977 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5e0deecc", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "sys.path.insert(0, str(Path.cwd().parent.parent.absolute()))\n", + "import tilelang\n", + "import torch\n", + "import tilelang.language as T" + ] + }, + { + "cell_type": "markdown", + "id": "1ca2c56d", + "metadata": {}, + "source": [ + "# Tilelang Lazy JIT" + ] + }, + { + "cell_type": "markdown", + "id": "156e7370", + "metadata": {}, + "source": [ + "## Tensor Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "b070c109", + "metadata": {}, + "source": [ + "Tilelang Lazy JIT 将 jit 
生成和调用的逻辑合并到一起\n", + "\n", + "函数签名的写法与 triton 相似,但做了大量增强,最主要的增强是允许对 Tensor 的标注:\n", + "\n", + "* 如果一个 Tensor 有复杂的 shape 约束,我们可以把它的标注移动到函数内部\n", + "* 通过 `T.const` 或 `T.dynamic` 来建立一些 shape 变量,然后用 `T.Tensor` 标注复杂的 Tensor\n", + "* 用 `T.empty` 来声明返回值" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "60bf8954", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def gemm(\n", + " A,\n", + " B,\n", + " out_dtype: T.dtype = T.float32,\n", + " block_M: int = 128,\n", + " block_N: int = 128,\n", + " block_K: int = 32,\n", + "):\n", + " M, N, K = T.const(\"M, N, K\")\n", + "\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + "\n", + " C = T.empty((M, N), out_dtype)\n", + "\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n", + " B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n", + " C_local = T.alloc_fragment((block_M, block_N), out_dtype)\n", + " T.clear(C_local)\n", + " for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n", + " T.copy(A[bx * block_M, k * block_K], A_shared)\n", + " T.copy(B[k * block_K, by * block_N], B_shared)\n", + " T.gemm(A_shared, B_shared, C_local)\n", + " T.copy(C_local, C[bx * block_M, by * block_N])\n", + " return C" + ] + }, + { + "cell_type": "markdown", + "id": "28f868fe", + "metadata": {}, + "source": [ + "直接将 Tensor 作为参数调用,即可触发完整的 jit 编译运行流程:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ee13394a", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm(A, B)\n", + "\n", + "# check output is correct\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "c6705091", + "metadata": {}, + "source": [ + "更改调用的参数,如果编译器参数发生了变化,会触发重新编译:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d8aab5b7", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 1024, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm(A, B, block_M=64, block_N=64)" + ] + }, + { + "cell_type": "markdown", + "id": "ce6b7391", + "metadata": {}, + "source": [ + "你也可以手动调用 compile 函数编译 kernel\n", + "\n", + "1. `ker.compile` 编译 kernel\n", + "2. `ker.get_tir` 获取 tir\n", + "3. 
`ker.par_compile` 并行编译" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f3cf3a2d", + "metadata": {}, + "outputs": [], + "source": [ + "kernel = gemm.compile(A, B, block_M=64, block_N=64)\n", + "C = kernel(A, B)" + ] + }, + { + "cell_type": "markdown", + "id": "921761b5", + "metadata": {}, + "source": [ + "## More Tensor Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "4539e54e", + "metadata": {}, + "source": [ + "### 用 macro 来分离实现" + ] + }, + { + "cell_type": "markdown", + "id": "ad96ba65", + "metadata": {}, + "source": [ + "接下来,我们会用各种方式来实现一个简单的 gemm,为了方便,我们先写一个 macro 把 gemm 的主要逻辑写出来:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "171d4fe6", + "metadata": {}, + "outputs": [], + "source": [ + "@T.macro\n", + "def gemm_impl(A, B, C, M, N, K, block_M, block_N, block_K):\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n", + " B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n", + " C_local = T.alloc_fragment((block_M, block_N), C.dtype)\n", + " T.clear(C_local)\n", + " for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n", + " T.copy(A[bx * block_M, k * block_K], A_shared)\n", + " T.copy(B[k * block_K, by * block_N], B_shared)\n", + " T.gemm(A_shared, B_shared, C_local)\n", + " T.copy(C_local, C[bx * block_M, by * block_N])" + ] + }, + { + "cell_type": "markdown", + "id": "446a1acd", + "metadata": {}, + "source": [ + "### 用 T.dynamic 标记动态 Shape\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6a38aa95", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def gemm_dyn_K(A, B):\n", + " M, N, K = T.dynamic(\"M, N, K\")\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, 128, 128, 32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fe6cfdc8", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_dyn_K(A, B)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "2ee97bf7", + "metadata": {}, + "source": [ + "### 用 T.StridedTensor 标记带 stride 的 Tensor\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9dde1dae", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def as_contingious(A):\n", + " M, N, dM, dN = T.dynamic(\"M, N, dM, dN\")\n", + " A: T.StridedTensor[[M, N], [dM, dN], T.float32]\n", + " B = T.empty((M, N), A.dtype)\n", + " block_M = 128\n", + " block_N = 128\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " T.copy(\n", + " A[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n", + " B[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n", + " )\n", + " return B" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "dec2c0a7", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 1024, device=\"cuda\")\n", + "B = as_contingious(A.T)\n", + "B_ref = A.T.contiguous()\n", + "torch.testing.assert_close(B, B_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "f5fb20d6", + "metadata": {}, + "source": [ + "## More Annotation" + ] + }, 
+ { + "cell_type": "markdown", + "id": "890df0a2", + "metadata": {}, + "source": [ + "### 直接用参数当 annotation" + ] + }, + { + "cell_type": "markdown", + "id": "e9a47d42", + "metadata": {}, + "source": [ + "可以直接把函数参数写到 annotation 里面" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0fc17af6", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def gemm_ptr(\n", + " A,\n", + " B,\n", + " M,\n", + " N,\n", + " K,\n", + "):\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8e52a554", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_ptr(A, B, 1024, 256, 512)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "6b19ef90", + "metadata": {}, + "source": [ + "### 对运行时变量的 annotation" + ] + }, + { + "cell_type": "markdown", + "id": "bba5f27f", + "metadata": {}, + "source": [ + "运行时变量也是一样,如果嫌函数 annotation 太长,可以放到函数体里面" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c1e7598a", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def gemm_ptr_dyn(A, B, M, N, K):\n", + " M: T.int32\n", + " N: T.int32\n", + " K: T.int32\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9e9a4c88", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_ptr_dyn(A, B, 1024, 256, 512)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "81427765", + "metadata": {}, + "source": [ + "### 常量的约束" + ] + }, + { + "cell_type": "markdown", + "id": "4d6b084b", + "metadata": {}, + "source": [ + "`T.const` 创建的常量 annotation 只要要被直接使用一次,否则会报错" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c90dd24f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constexpr variable `M` is not used in any buffer shape or stride.\n", + "At least one **DIRECT** usage is required. Please check:\n", + "(1) the variable is not used\n", + "(2) all uses are indirect, e.g. M * 2, M * 3. 
(you can replace them with separate constexpr variables)\n", + "Buffer shapes: {A: [M * 2, M * 3]}\n", + "Buffer strides: {A: [M * 3, 1]}\n" + ] + } + ], + "source": [ + "@tilelang.jit\n", + "def example_wrong_kernel(A):\n", + " M = T.const(\"M\")\n", + " A: T.Tensor[[M * 2, M * 3], T.float32]\n", + " with T.Kernel(1) as _:\n", + " A[0, 0]\n", + "\n", + "\n", + "try:\n", + " A = torch.randn(64, 96, dtype=torch.float32, device=\"cuda\")\n", + " example_wrong_kernel(A)\n", + "except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "id": "e07e762b", + "metadata": {}, + "source": [ + "### 动态维度" + ] + }, + { + "cell_type": "markdown", + "id": "f48e5d7a", + "metadata": {}, + "source": [ + "如果想让 Tensor 的 annotation 中某个参数可以变化,建议改成 T.ptr + T.match_buffer 格式。" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "1d050321", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@tilelang.jit\n", + "def dyn_annot(\n", + " A: T.ptr, # 1. T.ptr type annotation\n", + " is_2d=False,\n", + "):\n", + " if is_2d:\n", + " M, N = T.const(\"M, N\")\n", + " # 2. dynamic shape annotation inside function body\n", + " A = T.match_buffer(A, [M, N], T.float32)\n", + " with T.Kernel(1) as _:\n", + " A[0, 0]\n", + " else:\n", + " L = T.const(\"L\")\n", + " A = T.match_buffer(A, [L], T.float32)\n", + " with T.Kernel(1) as _:\n", + " A[0]\n", + "\n", + "\n", + "A = torch.randn(64, 96, dtype=torch.float32, device=\"cuda\")\n", + "dyn_annot(A, is_2d=True)" + ] + }, + { + "cell_type": "markdown", + "id": "2e9f1bb3", + "metadata": {}, + "source": [ + "### 带默认参数的" + ] + }, + { + "cell_type": "markdown", + "id": "f7fc9917", + "metadata": {}, + "source": [ + "类似 `T.float32` 标注的标量可以带默认参数" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "42ec86a1", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def add_one(X, data: T.float32 = 1):\n", + " M, N = T.const(\"M, N\")\n", + " X: T.Tensor[[M, N], T.float32]\n", + " Y = T.empty((M, N), T.float32)\n", + " with T.Kernel(T.ceildiv(M, 128), threads=128) as bx:\n", + " for i, j in T.Parallel(128, N):\n", + " Y[bx * 128 + i, j] = X[bx * 128 + i, j] + data\n", + " return Y" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d49e1120", + "metadata": {}, + "outputs": [], + "source": [ + "X = torch.randn(1024, 1024, dtype=torch.float32, device=\"cuda\")\n", + "Y = add_one(X)\n", + "torch.testing.assert_close(Y, X + 1)" + ] + }, + { + "cell_type": "markdown", + "id": "a02baedc", + "metadata": {}, + "source": [ + "## 参数匹配的 Overhead" + ] + }, + { + "cell_type": "markdown", + "id": "860a2972", + "metadata": {}, + "source": [ + "EagerJIT overhead 很小,每个 constant 添加约 200ns 的 overhead\n", + "* 200ns 大约是从 torch.Tensor 的 shape/stride 中拿参数的 ffi call 的代价" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc676e33", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Kernel call : 7.68 us\n", + "Parse cache key: 0.41 us\n" + ] + } + ], + "source": [ + "import time\n", + "\n", + "A = torch.randn(128, 128, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(128, 128, dtype=torch.float16, device=\"cuda\")\n", + "\n", + "\n", + "@tilelang.jit\n", + "def dummy_kernel(A, B):\n", + " M, N = T.const(\"M, N\")\n", + " A: T.Tensor[[M, N], T.float16]\n", + " B: T.Tensor[[M, N], T.float16]\n", + " with 
T.Kernel(1) as _:\n", + " pass\n", + "\n", + "\n", + "# compile it first\n", + "dummy_kernel(A, B)\n", + "\n", + "\n", + "def eval_overhead(f):\n", + " start = time.perf_counter_ns()\n", + " for _ in range(10000):\n", + " f()\n", + " stop = time.perf_counter_ns()\n", + " return (stop - start) / 10000 / 1000\n", + "\n", + "\n", + "kernel_call_overhead = eval_overhead(lambda: dummy_kernel(A, B))\n", + "parse_cache_key_overhead = eval_overhead(lambda: dummy_kernel.parse_cache_key(A, B))\n", + "\n", + "print(f\"Kernel call : {kernel_call_overhead:.2f} us\")\n", + "print(f\"Parse cache key: {parse_cache_key_overhead:.2f} us\")" + ] + }, + { + "cell_type": "markdown", + "id": "39166cb4", + "metadata": {}, + "source": [ + "## 编译与并行编译" + ] + }, + { + "cell_type": "markdown", + "id": "8c6fbe08", + "metadata": {}, + "source": [ + "Eager JIT 和原来的 jit(即 LazyJIT) 都支持并行编译\n", + "\n", + "为了防止 torch.tensor 白白浪费内存,可以使用 T.Tensor 来创建 placeholder" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7222e57b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8a4e4eb3cd4445bda6e8693da31ef3b8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Elaborating: 0%| | 0/8 [00:00,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from itertools import product\n", + "\n", + "\n", + "def get_configs():\n", + " return [\n", + " {\n", + " \"A\": T.Tensor((1024, 1024), T.float32),\n", + " \"B\": T.Tensor((1024, 1024), T.float32),\n", + " \"block_M\": block_M,\n", + " \"block_N\": block_N,\n", + " \"block_K\": block_K,\n", + " }\n", + " for block_M, block_N, block_K in product([32, 64], repeat=3)\n", + " ]\n", + "\n", + "\n", + "gemm.par_compile(get_configs())" + ] + }, + { + "cell_type": "markdown", + "id": "5160d2cc", + "metadata": {}, + "source": [ + "## 更便利的 Macro" + ] + }, + { + "cell_type": "markdown", + "id": "be44afc4", + "metadata": {}, + "source": [ + "tilelang 的 macro 现在已经升级:\n", + "\n", + "1. 允许用 `T.Ref` 作为 annotation,这类似与 C++ 的引用传递\n", + "2. 允许返回多个值\n", + "3. 
允许嵌套,递归" + ] + }, + { + "cell_type": "markdown", + "id": "79575972", + "metadata": {}, + "source": [ + "### T.Ref 传递引用\n", + "\n", + "T.Ref 传递的引用可以 var 也可以是 Buffer 的索引" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "90eaa6e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# from tvm.script import tir as T\n", + "\n", + "@T.prim_func\n", + "def foo(x_handle: T.handle):\n", + " x = T.match_buffer(x_handle, (2,), strides=(1,))\n", + " # with T.block(\"root\"):\n", + " bx = T.launch_thread(\"blockIdx.x\", 1)\n", + " tx = T.launch_thread(\"threadIdx.x\", 128)\n", + " ty = T.launch_thread(\"threadIdx.y\", 1)\n", + " tz = T.launch_thread(\"threadIdx.z\", 1)\n", + " with T.block(\"tilelang_root\"):\n", + " T.reads()\n", + " idx = T.Buffer((1,), \"int32\", scope=\"local.var\")\n", + " T.writes(x[T.min(1, idx[0]):T.min(1, idx[0]) + (T.max(1, idx[0]) + 1 - T.min(1, idx[0]))])\n", + " T.block_attr({\"tl.local_var_init\": {idx.data: 0}})\n", + " idx = T.alloc_buffer((1,), \"int32\", data=idx.data, scope=\"local.var\")\n", + " x[1] = T.float32(1.0)\n", + " _tmp: T.int32 = idx[0]\n", + " x[_tmp] = T.float32(1.0)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@T.macro\n", + "def macro_with_ref(x: T.Ref):\n", + " x = 1 # noqa: F841\n", + "\n", + "\n", + "@T.prim_func\n", + "def foo(x: T.Tensor((2,))):\n", + " with T.Kernel(1) as _:\n", + " # 支持常量 index\n", + " macro_with_ref(x[1])\n", + "\n", + " # 也支持变量 index\n", + " idx = T.alloc_var(T.int32, 0)\n", + " macro_with_ref(x[idx])\n", + "\n", + "\n", + "foo" + ] + }, + { + "cell_type": "markdown", + "id": "7bb447a2", + "metadata": {}, + "source": [ + "### 当作参数传递\n", + "\n", + "你可以把 macro 当做参数传递" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "dc7bb779", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def element_wise(A, fn):\n", + " N = T.dynamic(\"N\")\n", + " A: T.Tensor[[N], T.float32]\n", + " B = T.empty((N,), dtype=A.dtype)\n", + " block_N = 128\n", + " with T.Kernel(T.ceildiv(N, block_N), threads=128) as bx:\n", + " for i in T.Parallel(block_N):\n", + " idx = bx * block_N + i\n", + " B[idx] = fn(A[idx])\n", + " return B\n", + "\n", + "\n", + "@T.macro\n", + "def add_one(x):\n", + " return x + 1" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "a89fdb44", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, device=\"cuda\")\n", + "B = element_wise(A, add_one)\n", + "B_ref = A + 1\n", + "torch.testing.assert_close(B, B_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "ef6e403a", + "metadata": {}, + "source": [ + "### Macro 递归\n", + "\n", + "虽然不知道有没有这种需求,但 macro 是可以递归的,终止条件要求编译期间确定" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "7703cab5", + "metadata": {}, + "outputs": [], + "source": [ + "@T.macro\n", + "def n31(x, var: T.Ref):\n", + " if x == 1:\n", + " pass\n", + " elif x % 2 == 0:\n", + " var = var // 2\n", + " n31(x // 2, var)\n", + " else:\n", + " var = var * 3 + 1\n", + " n31(x * 3 + 1, var)\n", + "\n", + "\n", + "@tilelang.jit\n", + "def foo(A: T.Tensor[[1], T.int32], n: int):\n", + " with T.Kernel(1) as _:\n", + " n31(n, A[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "542ddd4e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([18], device='cuda:0', dtype=torch.int32)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "A = torch.tensor([100], dtype=torch.int32, device=\"cuda\")\n", + "foo(A, 5)\n", + "A" + ] + }, + { + "cell_type": "markdown", + "id": "dc30c2d2", + "metadata": {}, + "source": [ + "### Macro 返回多个值" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d5a2388f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# from tvm.script import tir as T\n", + "\n", + "@T.prim_func\n", + "def foo():\n", + " # with T.block(\"root\"):\n", + " x = T.launch_thread(\"blockIdx.x\", 32)\n", + " tx = T.launch_thread(\"threadIdx.x\", 128)\n", + " ty = T.launch_thread(\"threadIdx.y\", 1)\n", + " tz = T.launch_thread(\"threadIdx.z\", 1)\n", + " with T.block(\"tilelang_root\"):\n", + " T.reads()\n", + " T.writes()\n", + " s: T.int32 = T.sin(x)\n", + " c: T.int32 = T.cos(x)\n", + " a: T.int32 = s + c\n", + " b: T.int32 = s - c\n", + " T.evaluate(0)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@T.macro\n", + "def sincos(x):\n", + " return T.sin(x), T.cos(x)\n", + "\n", + "\n", + "@T.prim_func\n", + "def foo():\n", + " with T.Kernel(32) as x:\n", + " s, c = sincos(x)\n", + " a = s + c # noqa: F841\n", + " b = s - c # noqa: F841\n", + "\n", + "\n", + "foo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd83fea7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tilelang-dev_0", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/elementwise/example_elementwise_add.py b/examples/elementwise/example_elementwise_add.py index bc9bb4df5..3d142ed54 100644 --- a/examples/elementwise/example_elementwise_add.py +++ b/examples/elementwise/example_elementwise_add.py @@ -1,9 +1,7 @@ import argparse -import itertools import torch import tilelang import tilelang.language as T -from tilelang.autotuner import AutoTuner def ref_program(x, y): @@ -12,10 +10,8 @@ def ref_program(x, y): @tilelang.jit(out_idx=[-1]) def elementwise_add(M, N, block_M, block_N, in_dtype, out_dtype, threads): - @T.prim_func - def elem_add(A: T.Tensor((M, N), in_dtype), B: T.Tensor((M, N), in_dtype), C: T.Tensor( - (M, N), out_dtype)): + def elem_add(A: T.Tensor((M, N), in_dtype), B: T.Tensor((M, N), in_dtype), C: T.Tensor((M, N), out_dtype)): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared((block_M, block_N), in_dtype) B_shared = T.alloc_shared((block_M, block_N), in_dtype) @@ -24,7 +20,7 @@ def elem_add(A: T.Tensor((M, N), in_dtype), B: T.Tensor((M, N), in_dtype), C: T. T.copy(A[by * block_M, bx * block_N], A_shared) T.copy(B[by * block_M, bx * block_N], B_shared) - for (local_y, local_x) in T.Parallel(block_M, block_N): + for local_y, local_x in T.Parallel(block_M, block_N): C_local[local_y, local_x] = A_shared[local_y, local_x] + B_shared[local_y, local_x] T.copy(C_local, C_shared) T.copy(C_shared, C[by * block_M, bx * block_N]) @@ -32,53 +28,34 @@ def elem_add(A: T.Tensor((M, N), in_dtype), B: T.Tensor((M, N), in_dtype), C: T. 
return elem_add -def get_configs(M, N): - block_M = [64, 128, 256] - block_N = [64, 128, 256] - threads = [64, 128, 256] - configs = list(itertools.product(block_M, block_N, threads)) - return [{"block_M": bm, "block_N": bn, "threads": th} for bm, bn, th in configs] - - -def get_best_config(M, N): +def main(M=1024, N=1024, use_autotune=False): + a = torch.randn(M, N, dtype=torch.float32, device="cuda") + b = torch.randn(M, N, dtype=torch.float32, device="cuda") - def kernel(block_M=None, block_N=None, threads=None): - return elementwise_add(M, N, block_M, block_N, "float32", "float32", threads) + kernel = elementwise_add(M, N, block_M=32, block_N=32, threads=128, in_dtype=T.float32, out_dtype=T.float32) - autotuner = AutoTuner.from_kernel( - kernel=kernel, configs=get_configs(M, N)).set_compile_args( - out_idx=[-1], - target="cuda", - ).set_profile_args( - supply_type=tilelang.TensorSupplyType.Auto, - ref_prog=ref_program, - skip_check=False, - ) - return autotuner.run(warmup=3, rep=20) + out = kernel(a, b) + torch.testing.assert_close(out, ref_program(a, b), rtol=1e-2, atol=1e-2) -def main(): +def run_regression_perf(): parser = argparse.ArgumentParser() - parser.add_argument("--m", type=int, default=1024) - parser.add_argument("--n", type=int, default=1024) - parser.add_argument("--use_autotune", action="store_true", default=False) + parser.add_argument("--m", type=int, default=4096) + parser.add_argument("--n", type=int, default=4096) args, _ = parser.parse_known_args() M, N = args.m, args.n a = torch.randn(M, N, dtype=torch.float32, device="cuda") b = torch.randn(M, N, dtype=torch.float32, device="cuda") + config = {"block_M": 32, "block_N": 32, "threads": 128} + kernel = elementwise_add(M, N, **config, in_dtype="float32", out_dtype="float32") + from tilelang.profiler import do_bench - if args.use_autotune: - result = get_best_config(M, N) - kernel = result.kernel - else: - # Default config - config = {"block_M": 32, "block_N": 32, "threads": 128} - kernel = elementwise_add(M, N, **config, in_dtype="float32", out_dtype="float32") - - out = kernel(a, b) - torch.testing.assert_close(out, ref_program(a, b), rtol=1e-2, atol=1e-2) + return do_bench(lambda: kernel(a, b), backend="cupti") if __name__ == "__main__": - main() + parser = argparse.ArgumentParser() + parser.add_argument("--m", type=int, default=1024) + parser.add_argument("--n", type=int, default=1024) + args, _ = parser.parse_known_args() + main(args.m, args.n) diff --git a/examples/elementwise/regression_example_elementwise.py b/examples/elementwise/regression_example_elementwise.py new file mode 100644 index 000000000..261202a56 --- /dev/null +++ b/examples/elementwise/regression_example_elementwise.py @@ -0,0 +1,10 @@ +import tilelang.testing +import example_elementwise_add + + +def regression_example_elementwise_add(): + tilelang.testing.process_func(example_elementwise_add.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/flash_attention/README.md b/examples/flash_attention/README.md index be11a8dc6..355ed7325 100644 --- a/examples/flash_attention/README.md +++ b/examples/flash_attention/README.md @@ -34,8 +34,6 @@ def flash_attention( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - # Annotate layout for Q_shared, e.g., use a swizzled layout to optimize memory access - T.annotate_layout({Q_shared: tl.layout.make_swizzled_layout(Q_shared)}) # Copy a block of Q from global memory to Q_shared T.copy(Q[bz, bx * 
block_M : (bx + 1) * block_M, by, :], Q_shared) @@ -77,6 +75,8 @@ def flash_attention( # Compute the maximum value per row on dimension 1 (block_N) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # Compute the factor by which we need to rescale previous partial sums for i in T.Parallel(block_M): @@ -106,4 +106,4 @@ def flash_attention( # Write back the final output block from acc_o to the Output buffer T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) -``` \ No newline at end of file +``` diff --git a/examples/flash_attention/bert_padding.py b/examples/flash_attention/bert_padding.py index 7058fd773..15c4097ce 100644 --- a/examples/flash_attention/bert_padding.py +++ b/examples/flash_attention/bert_padding.py @@ -6,7 +6,6 @@ class IndexFirstAxis(torch.autograd.Function): - @staticmethod def forward(ctx, input, indices): ctx.save_for_backward(indices) @@ -15,9 +14,7 @@ def forward(ctx, input, indices): second_dim = other_shape.numel() # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing. # return input[indices] - return torch.gather( - rearrange(input, "b ... -> b (...)"), 0, - repeat(indices, "z -> z d", d=second_dim)).reshape(-1, *other_shape) + return torch.gather(rearrange(input, "b ... -> b (...)"), 0, repeat(indices, "z -> z d", d=second_dim)).reshape(-1, *other_shape) @staticmethod def backward(ctx, grad_output): @@ -40,14 +37,12 @@ def backward(ctx, grad_output): class IndexPutFirstAxis(torch.autograd.Function): - @staticmethod def forward(ctx, values, indices, first_axis_dim): ctx.save_for_backward(indices) assert indices.ndim == 1 assert values.ndim >= 2 - output = torch.zeros( - first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype) + output = torch.zeros(first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype) # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing. output[indices] = values # output.scatter_(0, repeat(indices, 'z -> z d', d=values.shape[1]), values) @@ -66,7 +61,6 @@ def backward(ctx, grad_output): class IndexFirstAxisResidual(torch.autograd.Function): - @staticmethod def forward(ctx, input, indices): ctx.save_for_backward(indices) @@ -128,7 +122,7 @@ def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_leng """ Supports concatenating short samples in one sequence. The attention_mask_in_length is utilized to mask other short samples. It helps efficient training of variant lengths-based samples (e.g., the supervised fine-tuning task in large language model). The motivation for this function is explained [here](https://github.com/Dao-AILab/flash-attention/issues/432#issuecomment-1668822286). 
- + For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is: ``` [ @@ -177,9 +171,7 @@ def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_leng """ length = attention_mask_in_length.sum(dim=-1) seqlen = attention_mask_in_length.size(-1) - attention_mask_2d = torch.arange( - seqlen, device=length.device, dtype=length.dtype).expand(len(length), - seqlen) < length.unsqueeze(1) + attention_mask_2d = torch.arange(seqlen, device=length.device, dtype=length.dtype).expand(len(length), seqlen) < length.unsqueeze(1) real_indices_idx = torch.nonzero(attention_mask_in_length.flatten(), as_tuple=False).flatten() seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx] indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten() diff --git a/examples/flash_attention/example_gqa_bwd.py b/examples/flash_attention/example_gqa_bwd.py index 907a121d2..801927faf 100644 --- a/examples/flash_attention/example_gqa_bwd.py +++ b/examples/flash_attention/example_gqa_bwd.py @@ -6,25 +6,27 @@ @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, groups=1): - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=256) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -39,26 +41,25 @@ def flash_fwd( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by // groups, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, 
j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): @@ -72,29 +73,31 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim_v): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim_v] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -103,81 +106,74 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim_v, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep def make_dq_layout(dQ): # atomicAdd can not be vectorized, so we need to reorder dq to match the 8x8 gemm fragment - return T.Layout(dQ.shape, - lambda b, l, h, d: [b, l // 8, h, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) + return T.Layout(dQ.shape, lambda b, l, h, d: [b, l // 8, h, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) @tilelang.jit( - out_idx=[1], pass_configs={ + out_idx=[1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_postprocess(batch, heads, seq_len, dim_qk): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim_qk] blk = 64 @T.prim_func def flash_bwd_post( - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(shape, dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(shape, dtype), # type: ignore ): with 
T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) T.copy( - dQ[bz, bx * blk:(bx + 1) * blk, by, :], - dQ_out[bz, bx * blk:(bx + 1) * blk, by, :], + dQ[bz, bx * blk : (bx + 1) * blk, by, :], + dQ_out[bz, bx * blk : (bx + 1) * blk, by, :], ) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_atomic_add(batch, - heads, - seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_atomic_add(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -197,35 +193,35 @@ def flash_bwd( dk_shared = T.alloc_shared([block_M, dim_qk], accum_dtype) dv_shared = T.alloc_shared([block_M, dim_v], accum_dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - }) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * 
block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -237,49 +233,41 @@ def flash_bwd( for i, j in T.Parallel(block_N, dim_qk): T.atomic_add(dQ[bz, k * block_N + i, bx, j], dq[i, j]) T.copy(dv, dv_shared) - T.atomic_add(dV[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dv_shared) + T.atomic_add(dV[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dv_shared) T.copy(dk, dk_shared) - T.atomic_add(dK[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dk_shared) + T.atomic_add(dK[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dk_shared) return flash_bwd -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_split(batch, - heads, - seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_split(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] dk_shape = [groups, batch, seq_len, head_kv, dim_qk] # sum after kernel dv_shape = [groups, batch, seq_len, head_kv, dim_v] # sum after kernel - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(dk_shape, dtype), # type: ignore - dV: T.Tensor(dv_shape, dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(dk_shape, dtype), # type: ignore + dV: T.Tensor(dv_shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = 
T.alloc_shared([block_M, dim_qk], dtype) @@ -299,37 +287,35 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim_v], dtype) dk_shared = T.alloc_shared([block_M, dim_qk], dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -342,16 +328,15 @@ def flash_bwd( T.atomic_add(dQ[bz, k * block_N + i, bx, j], dq[i, j]) T.copy(dv, dv_shared) - T.copy(dv_shared, dV[bx % groups, bz, by * block_M:(by + 1) * block_M, bx // groups, :]) + T.copy(dv_shared, dV[bx % groups, bz, by * block_M : (by + 1) * block_M, bx // groups, :]) T.copy(dk, dk_shared) - T.copy(dk, dK[bx % groups, bz, by * block_M:(by + 1) * block_M, bx // groups, :]) + T.copy(dk, dK[bx % groups, bz, by * block_M : (by + 1) * block_M, bx // groups, :]) return flash_bwd @torch.compile class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal, groups=1, use_atomic=True): BATCH, N_CTX, H, D_HEAD_QK = q.shape @@ -369,7 +354,10 @@ def forward(ctx, q, k, v, causal, groups=1, use_atomic=True): def backward(ctx, do): q, k, v, o, lse = ctx.saved_tensors BATCH, N_CTX, H, D_HEAD_QK = q.shape - HEAD_KV, D_HEAD_V, = v.shape[-2], v.shape[-1] + ( + HEAD_KV, + D_HEAD_V, + ) = v.shape[-2], v.shape[-1] groups = H // HEAD_KV def maybe_contiguous(x): @@ -386,17 +374,8 @@ def maybe_contiguous(x): if ctx.use_atomic: kernel = flashattn_bwd_atomic_add( - BATCH, - H, - N_CTX, - D_HEAD_QK, - D_HEAD_V, - ctx.causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=groups) + BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups + ) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k 
= [BATCH, N_CTX, HEAD_KV, D_HEAD_QK] shape_v = [BATCH, N_CTX, HEAD_KV, D_HEAD_V] @@ -409,17 +388,8 @@ def maybe_contiguous(x): dv = dv.to(torch.float16) else: kernel = flashattn_bwd_split( - BATCH, - H, - N_CTX, - D_HEAD_QK, - D_HEAD_V, - ctx.causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=groups) + BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups + ) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k = [groups, BATCH, N_CTX, HEAD_KV, D_HEAD_QK] # sum after kernel shape_v = [groups, BATCH, N_CTX, HEAD_KV, D_HEAD_V] # sum after kernel @@ -441,53 +411,45 @@ def ref_program(Q, K, V, is_causal, groups=1): # K: [B, T, HK, D_QK] # V: [B, T, HV, D_V] # HQ = HKV * groups - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim_qk = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim_qk, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output -def main(BATCH: int = 1, - H: int = 32, - N_CTX: int = 256, - D_HEAD_QK: int = 192, - D_HEAD_V: int = 128, - groups: int = 16, - causal: bool = False, - use_atomic: bool = True): +def main( + BATCH: int = 1, + H: int = 32, + N_CTX: int = 256, + D_HEAD_QK: int = 192, + D_HEAD_V: int = 128, + groups: int = 16, + causal: bool = False, + use_atomic: bool = True, +): flops_per_qk = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_QK flops_per_v = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_V total_flops = 3 * flops_per_qk + 2 * flops_per_v if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() head_kv = H // groups - K = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - V = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - dO = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() O = attention(Q, K, V, causal, groups, use_atomic) O.backward(dO, retain_graph=True) dQ, Q.grad = Q.grad.clone(), None @@ -504,7 +466,7 @@ def main(BATCH: int = 1, 
torch.testing.assert_close(dV, dV_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dK, dK_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dQ, dQ_ref, rtol=1e-2, atol=1e-2) - print('All checks passed.✅') + print("All checks passed.✅") def run(): O_ref.backward(dO, retain_graph=True) @@ -522,19 +484,61 @@ def run1(): print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf(): + BATCH = 1 + H = 32 + N_CTX = 256 + D_HEAD_QK = 192 + D_HEAD_V = 128 + groups = 16 + causal = False + device = "cuda" + torch.manual_seed(42) + head_kv = H // groups + Q = torch.randn(BATCH, N_CTX, H, D_HEAD_QK, device=device, dtype=torch.half) + K = torch.randn(BATCH, N_CTX, head_kv, D_HEAD_QK, device=device, dtype=torch.half) + V = torch.randn(BATCH, N_CTX, head_kv, D_HEAD_V, device=device, dtype=torch.half) + O = torch.randn(BATCH, N_CTX, H, D_HEAD_V, device=device, dtype=torch.half) + dO = torch.randn(BATCH, N_CTX, H, D_HEAD_V, device=device, dtype=torch.half) + lse = torch.zeros(BATCH, H, N_CTX, device=device, dtype=torch.float32) + with torch.no_grad(): + mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD_V) + kernel = flashattn_bwd_split( + BATCH, + H, + N_CTX, + D_HEAD_QK, + D_HEAD_V, + causal, + block_M=128, + block_N=32, + threads=256, + num_stages=2, + groups=groups, + ) + dQ = torch.zeros_like(Q, dtype=torch.float32) + dK = torch.zeros(groups, BATCH, N_CTX, head_kv, D_HEAD_QK, device=device, dtype=torch.float16) + dV = torch.zeros(groups, BATCH, N_CTX, head_kv, D_HEAD_V, device=device, dtype=torch.float16) + Delta = mod_prep(O, dO) + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(Q, K, V, dO, lse, Delta, dQ, dK, dV) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head_qk', type=int, default=192, help='Head dimension for Q/K') - parser.add_argument('--d_head_v', type=int, default=128, help='Head dimension for V') - parser.add_argument('--causal', action='store_true', help='Causal flag') - parser.add_argument('--groups', type=int, default=16, help='groups') - parser.add_argument( - '--use_atomic', action='store_true', default=False, help='Use atomic add for dK/dV') - parser.add_argument( - '--use_split', action='store_true', default=False, help='Use split for dK/dV') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head_qk", type=int, default=192, help="Head dimension for Q/K") + parser.add_argument("--d_head_v", type=int, default=128, help="Head dimension for V") + parser.add_argument("--causal", action="store_true", help="Causal flag") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--use_atomic", action="store_true", default=False, help="Use atomic add for dK/dV") + parser.add_argument("--use_split", action="store_true", default=False, help="Use split for dK/dV") args = parser.parse_args() # Handle backward compatibility and logic @@ -546,5 +550,4 @@ def run1(): # Default: use atomic use_atomic = True - main(args.batch, args.h, args.n_ctx, args.d_head_qk, 
args.d_head_v, args.groups, args.causal, - use_atomic) + main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, use_atomic) diff --git a/examples/flash_attention/example_gqa_bwd_tma_reduce.py b/examples/flash_attention/example_gqa_bwd_tma_reduce.py index 615c2e191..4920d8cf0 100644 --- a/examples/flash_attention/example_gqa_bwd_tma_reduce.py +++ b/examples/flash_attention/example_gqa_bwd_tma_reduce.py @@ -5,29 +5,29 @@ from tilelang.contrib import nvcc import argparse -tilelang.disable_cache() - @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, groups=1): - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=256) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -42,28 +42,27 @@ def flash_fwd( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) # Warning: in causal/varlen/unaligned seqlen scenarios, the -inf will cause undefined behavior in exp ops # We should set it to negative large number instead - T.fill(scores_max, T.Cast(accum_dtype, -1e30)) - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + T.fill(scores_max, T.cast(-1e30, accum_dtype)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by // groups, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - T.Cast(accum_dtype, -1e30)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, T.cast(-1e30, accum_dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], 
V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): @@ -77,29 +76,31 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim_v): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim_v] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -108,12 +109,12 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim_v, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep @@ -124,12 +125,14 @@ def make_dq_layout(dQ): @tilelang.jit( - out_idx=[3, 4, 5], pass_configs={ + out_idx=[3, 4, 5], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_postprocess(batch, heads, head_kv, seq_len, dim_qk, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] @@ -137,64 +140,55 @@ def flashattn_bwd_postprocess(batch, heads, head_kv, seq_len, dim_qk, dim_v): @T.prim_func def flash_bwd_post( - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(q_shape, dtype), # type: ignore - dK_out: T.Tensor(k_shape, dtype), # type: ignore - dV_out: T.Tensor(v_shape, dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(q_shape, dtype), # type: ignore + dK_out: T.Tensor(k_shape, dtype), # type: ignore + dV_out: 
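These kernels evaluate softmax with exp2 rather than exp, folding log2(e) ≈ 1.44269504 into the score scale and keeping lse in the log2 domain (note the T.exp2 / T.log2 pairs above). A minimal PyTorch sketch, with arbitrary illustrative sizes, of why this is equivalent to the usual softmax:

import torch

torch.manual_seed(0)
d = 64
q = torch.randn(d)
k = torch.randn(16, d)

sm_scale = (1.0 / d) ** 0.5
scale = sm_scale * 1.44269504                # sm_scale * log2(e)

s = k @ q                                    # raw scores for one query
ref = torch.softmax(s * sm_scale, dim=0)

m = (s * scale).max()
p = torch.exp2(s * scale - m)                # exp(x * sm_scale) == exp2(x * scale)
lse = torch.log2(p.sum()) + m                # log-sum-exp stored in base 2
out = torch.exp2(s * scale - lse)            # normalised probabilities
assert torch.allclose(out, ref, atol=1e-6)

The backward kernels reuse the stored lse the same way: qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) recomputes exactly these probabilities.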
T.Tensor(v_shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) - T.copy(dQ[bz, bx * blk:(bx + 1) * blk, by, :], dQ_out[bz, bx * blk:(bx + 1) * blk, - by, :]) + T.copy(dQ[bz, bx * blk : (bx + 1) * blk, by, :], dQ_out[bz, bx * blk : (bx + 1) * blk, by, :]) with T.Kernel(T.ceildiv(seq_len, blk), head_kv, batch, threads=128) as (bx, by, bz): - T.annotate_layout({ - dK: make_dq_layout(dK), - dV: make_dq_layout(dV), - }) - T.copy(dK[bz, bx * blk:(bx + 1) * blk, by, :], dK_out[bz, bx * blk:(bx + 1) * blk, - by, :]) - T.copy(dV[bz, bx * blk:(bx + 1) * blk, by, :], dV_out[bz, bx * blk:(bx + 1) * blk, - by, :]) + T.annotate_layout( + { + dK: make_dq_layout(dK), + dV: make_dq_layout(dV), + } + ) + T.copy(dK[bz, bx * blk : (bx + 1) * blk, by, :], dK_out[bz, bx * blk : (bx + 1) * blk, by, :]) + T.copy(dV[bz, bx * blk : (bx + 1) * blk, by, :], dV_out[bz, bx * blk : (bx + 1) * blk, by, :]) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_atomic_add(batch, - heads, - seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_atomic_add(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -215,37 +209,29 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim_v], accum_dtype) dq_shared = T.alloc_shared([block_N, dim_qk], accum_dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - dK: make_dq_layout(dK), - dV: make_dq_layout(dV), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - }) - - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // 
groups, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -255,53 +241,43 @@ def flash_bwd( T.clear(dq) T.gemm(dsT_shared, K_shared, dq, transpose_A=True) T.copy(dq, dq_shared) - T.atomic_add(dQ[bz, k * block_N:(k + 1) * block_N, bx, :], dq_shared, use_tma=True) + T.atomic_add(dQ[bz, k * block_N : (k + 1) * block_N, bx, :], dq_shared, use_tma=True) T.copy(dv, dv_shared) - T.atomic_add( - dV[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dv_shared, use_tma=True) + T.atomic_add(dV[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dv_shared, use_tma=True) T.copy(dk, dk_shared) - T.atomic_add( - dK[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dk_shared, use_tma=True) + T.atomic_add(dK[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dk_shared, use_tma=True) return flash_bwd -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_split_novarlen(batch, - heads, - seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_split_novarlen(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] dk_shape = [groups, batch, seq_len, head_kv, dim_qk] # sum after kernel dv_shape = [groups, batch, seq_len, head_kv, dim_v] # sum after kernel - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor([batch, seq_len, heads, dim_v], 
dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(dk_shape, dtype), # type: ignore - dV: T.Tensor(dv_shape, dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(dk_shape, dtype), # type: ignore + dV: T.Tensor(dv_shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -321,37 +297,35 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim_v], dtype) dk_shared = T.alloc_shared([block_M, dim_qk], dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -364,16 +338,15 @@ def flash_bwd( T.atomic_add(dQ[bz, k * block_N + i, bx, j], dq[i, j]) T.copy(dv, dv_shared) - T.copy(dv_shared, dV[bx % groups, bz, by * block_M:(by + 1) * block_M, bx // groups, :]) + T.copy(dv_shared, dV[bx % groups, bz, by * block_M : (by + 1) * block_M, bx // groups, :]) T.copy(dk, dk_shared) - T.copy(dk, dK[bx % groups, bz, by * block_M:(by + 1) * block_M, bx // groups, :]) + T.copy(dk, dK[bx % groups, bz, by * block_M : (by + 1) * block_M, bx // groups, 
:]) return flash_bwd @torch.compile class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal, groups=1, use_atomic=True): BATCH, N_CTX, H, D_HEAD_QK = q.shape @@ -391,7 +364,10 @@ def forward(ctx, q, k, v, causal, groups=1, use_atomic=True): def backward(ctx, do): q, k, v, o, lse = ctx.saved_tensors BATCH, N_CTX, H, D_HEAD_QK = q.shape - HEAD_KV, D_HEAD_V, = v.shape[-2], v.shape[-1] + ( + HEAD_KV, + D_HEAD_V, + ) = v.shape[-2], v.shape[-1] groups = H // HEAD_KV def maybe_contiguous(x): @@ -403,22 +379,12 @@ def maybe_contiguous(x): block_M = 128 block_N = 32 mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD_V) - mod_post = flashattn_bwd_postprocess(BATCH, H, HEAD_KV, N_CTX, D_HEAD_QK, D_HEAD_V) delta = mod_prep(o, do) if ctx.use_atomic: kernel = flashattn_bwd_atomic_add( - BATCH, - H, - N_CTX, - D_HEAD_QK, - D_HEAD_V, - ctx.causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=groups) + BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups + ) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k = [BATCH, N_CTX, HEAD_KV, D_HEAD_QK] shape_v = [BATCH, N_CTX, HEAD_KV, D_HEAD_V] @@ -426,20 +392,11 @@ def maybe_contiguous(x): dk = torch.zeros(shape_k, dtype=torch.float32, device=q.device) dv = torch.zeros(shape_v, dtype=torch.float32, device=q.device) kernel(q, k, v, do, lse, delta, dq, dk, dv) - dq, dk, dv = mod_post(dq, dk, dv) else: kernel = flashattn_bwd_split_novarlen( - BATCH, - H, - N_CTX, - D_HEAD_QK, - D_HEAD_V, - ctx.causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=groups) + BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups + ) + mod_post = flashattn_bwd_postprocess(BATCH, H, HEAD_KV, N_CTX, D_HEAD_QK, D_HEAD_V) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k = [groups, BATCH, N_CTX, HEAD_KV, D_HEAD_QK] # sum after kernel shape_v = [groups, BATCH, N_CTX, HEAD_KV, D_HEAD_V] # sum after kernel @@ -447,8 +404,7 @@ def maybe_contiguous(x): dk = torch.empty(shape_k, dtype=torch.float16, device=q.device) dv = torch.empty(shape_v, dtype=torch.float16, device=q.device) kernel(q, k, v, do, lse, delta, dq, dk, dv) - dq, _, _ = mod_post(dq, torch.zeros_like(k, dtype=torch.float32), - torch.zeros_like(v, dtype=torch.float32)) + dq, _, _ = mod_post(dq, torch.zeros_like(k, dtype=torch.float32), torch.zeros_like(v, dtype=torch.float32)) dk, dv = dk.sum(0), dv.sum(0) return dq, dk, dv, None, None, None @@ -462,53 +418,45 @@ def ref_program(Q, K, V, is_causal, groups=1): # K: [B, T, HK, D_QK] # V: [B, T, HV, D_V] # HQ = HKV * groups - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim_qk = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim_qk, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, 
float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output -def main(BATCH: int = 1, - H: int = 32, - N_CTX: int = 256, - D_HEAD_QK: int = 192, - D_HEAD_V: int = 128, - groups: int = 16, - causal: bool = False, - use_atomic: bool = True): +def main( + BATCH: int = 1, + H: int = 32, + N_CTX: int = 256, + D_HEAD_QK: int = 192, + D_HEAD_V: int = 128, + groups: int = 16, + causal: bool = False, + use_atomic: bool = True, +): flops_per_qk = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_QK flops_per_v = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_V total_flops = 3 * flops_per_qk + 2 * flops_per_v if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() head_kv = H // groups - K = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - V = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - dO = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() O = attention(Q, K, V, causal, groups, use_atomic) O.backward(dO, retain_graph=True) dQ, Q.grad = Q.grad.clone(), None @@ -525,7 +473,7 @@ def main(BATCH: int = 1, torch.testing.assert_close(dV, dV_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dK, dK_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dQ, dQ_ref, rtol=1e-2, atol=1e-2) - print('All checks passed.✅') + print("All checks passed.✅") def run(): O_ref.backward(dO, retain_graph=True) @@ -548,17 +496,15 @@ def run1(): print(f"Detected GPU compute capability: {arch}") assert float(arch) >= 9.0, "This example only supports GPU with compute capability >= 9.0" parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head_qk', type=int, default=192, help='Head dimension for Q/K') - parser.add_argument('--d_head_v', type=int, default=128, help='Head dimension for V') - parser.add_argument('--causal', action='store_true', help='Causal flag') - parser.add_argument('--groups', type=int, default=16, help='groups') - parser.add_argument( - '--use_atomic', action='store_true', default=False, help='Use atomic add for dK/dV') - parser.add_argument( - '--use_split', action='store_true', default=False, help='Use split for dK/dV') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head_qk", type=int, default=192, help="Head dimension for Q/K") + parser.add_argument("--d_head_v", type=int, default=128, 
help="Head dimension for V") + parser.add_argument("--causal", action="store_true", help="Causal flag") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--use_atomic", action="store_true", default=False, help="Use atomic add for dK/dV") + parser.add_argument("--use_split", action="store_true", default=False, help="Use split for dK/dV") args = parser.parse_args() # Handle backward compatibility and logic @@ -570,5 +516,4 @@ def run1(): # Default: use atomic use_atomic = True - main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, - use_atomic) + main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, use_atomic) diff --git a/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py b/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py index 88f2d81e1..b09eec00c 100644 --- a/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py +++ b/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py @@ -7,57 +7,44 @@ from einops import rearrange, repeat from bert_padding import pad_input, unpad_input -# tilelang.disable_cache() - def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): assert mode in ["full", "random", "third"] if mode == "full": lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32) elif mode == "random": - lengths = torch.randint( - max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device) + lengths = torch.randint(max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device) elif mode == "third": lengths = torch.randint(max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device) - padding_mask = ( - repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths) + padding_mask = repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths return padding_mask @tilelang.jit( - out_idx=[5, 6], pass_configs={ + out_idx=[5, 6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_fwd(batch, - total_q, - total_kv, - N_CTX, - heads, - max_seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - groups=1): - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn_fwd(batch, total_q, total_kv, N_CTX, heads, max_seq_len, dim_qk, dim_v, is_causal, block_M, block_N, groups=1): + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [total_q, heads, dim_qk] k_shape = [total_kv, head_kv, dim_qk] v_shape = [total_kv, head_kv, dim_v] o_shape = [total_q, heads, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - cu_seqlens_q: T.Tensor([batch + 1], "int32"), # type: ignore - cu_seqlens_k: T.Tensor([batch + 1], "int32"), # type: ignore - Output: T.Tensor(o_shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + cu_seqlens_q: T.Tensor([batch + 1], T.int32), # type: ignore + cu_seqlens_k: T.Tensor([batch + 1], T.int32), # type: ignore + Output: T.Tensor(o_shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: 
ignore ): with T.Kernel(T.ceildiv(max_seq_len, block_M), heads, batch, threads=256) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -79,8 +66,6 @@ def flash_fwd( q_current_seqlen = q_end_idx - q_start_idx k_current_seqlen = k_end_idx - k_start_idx - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - for i, d in T.Parallel(block_M, dim_qk): if bx * block_M + i < q_current_seqlen: Q_shared[i, d] = Q[q_start_idx + bx * block_M + i, by, d] @@ -91,7 +76,7 @@ def flash_fwd( T.fill(logsum, 0.0) # Warning: in causal/varlen/unaligned seqlen scenarios, the -inf will cause undefined behavior in exp ops # We should set it to negative large number instead - T.fill(scores_max, T.Cast(accum_dtype, -1e30)) + T.fill(scores_max, T.cast(-1e30, accum_dtype)) loop_range = T.ceildiv(k_current_seqlen, block_N) for k in T.Pipelined(loop_range, num_stages=1): for i, d in T.Parallel(block_N, dim_qk): @@ -102,15 +87,17 @@ def flash_fwd( if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= k * block_N + j) and - (bx * block_M + i < q_current_seqlen and - k * block_N + j < k_current_seqlen), 0, - T.Cast(accum_dtype, -1e30)) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= k * block_N + j) + and (bx * block_M + i < q_current_seqlen and k * block_N + j < k_current_seqlen), + 0, + T.cast(-1e30, accum_dtype), + ) else: for i, j in T.Parallel(block_M, block_N): acc_s[i, j] = T.if_then_else( - bx * block_M + i < q_current_seqlen and - k * block_N + j < k_current_seqlen, 0, T.Cast(accum_dtype, -1e30)) + bx * block_M + i < q_current_seqlen and k * block_N + j < k_current_seqlen, 0, T.cast(-1e30, accum_dtype) + ) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) for i, d in T.Parallel(block_N, dim_v): if k * block_N + i < k_current_seqlen: @@ -119,6 +106,8 @@ def flash_fwd( V_shared[i, d] = 0.0 T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): @@ -146,21 +135,23 @@ def flash_fwd( @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, total_q, N_CTX, max_seq_len, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [total_q, heads, dim_v] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - cu_seqlens_q: T.Tensor([batch + 1], "int32"), # type: ignore - Delta: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + cu_seqlens_q: T.Tensor([batch + 1], T.int32), # type: ignore + Delta: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(max_seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -199,12 +190,14 @@ def make_dq_layout(dQ): @tilelang.jit( - out_idx=[3, 4, 5], pass_configs={ + out_idx=[3, 4, 5], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_postprocess(total_q, total_kv, heads, head_kv, dim_qk, dim_v): - dtype = "float16" - 
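The scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) line added after T.reduce_max keeps the running row maximum from decreasing when a tile contributes only masked (-1e30) entries. A plain PyTorch sketch of the streaming-softmax update this protects, for one query row with made-up sizes (exp is used instead of exp2 for brevity):

import torch

torch.manual_seed(0)
scores = torch.randn(128)            # one query row of attention scores
values = torch.randn(128)            # matching values (d_v = 1 for simplicity)

m = torch.tensor(-1e30)              # running row max, kept monotone
l = torch.tensor(0.0)                # running sum of exp(score - m)
acc = torch.tensor(0.0)              # running sum of exp(score - m) * value
for s_blk, v_blk in zip(scores.split(32), values.split(32)):
    m_prev = m
    m = torch.maximum(m_prev, s_blk.max())   # never let the max go down
    rescale = torch.exp(m_prev - m)          # shrink earlier partial sums
    p = torch.exp(s_blk - m)
    l = l * rescale + p.sum()
    acc = acc * rescale + (p * v_blk).sum()

ref = (torch.softmax(scores, 0) * values).sum()
assert torch.allclose(acc / l, ref, atol=1e-5)

Keeping m monotone guarantees the rescale factor exp(m_prev - m) never exceeds 1, which is the numerically safe direction for the accumulators.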
accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 q_shape = [total_q, heads, dim_qk] k_shape = [total_kv, head_kv, dim_qk] v_shape = [total_kv, head_kv, dim_v] @@ -212,70 +205,62 @@ def flashattn_bwd_postprocess(total_q, total_kv, heads, head_kv, dim_qk, dim_v): @T.prim_func def flash_bwd_post( - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(q_shape, dtype), # type: ignore - dK_out: T.Tensor(k_shape, dtype), # type: ignore - dV_out: T.Tensor(v_shape, dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(q_shape, dtype), # type: ignore + dK_out: T.Tensor(k_shape, dtype), # type: ignore + dV_out: T.Tensor(v_shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(total_q, blk), heads, threads=128) as (bx, by): T.annotate_layout({dQ: make_dq_layout(dQ)}) - T.copy(dQ[bx * blk:(bx + 1) * blk, by, :], dQ_out[bx * blk:(bx + 1) * blk, by, :]) + T.copy(dQ[bx * blk : (bx + 1) * blk, by, :], dQ_out[bx * blk : (bx + 1) * blk, by, :]) with T.Kernel(T.ceildiv(total_kv, blk), head_kv, threads=128) as (bx, by): - T.annotate_layout({ - dK: make_dq_layout(dK), - dV: make_dq_layout(dV), - }) - T.copy(dK[bx * blk:(bx + 1) * blk, by, :], dK_out[bx * blk:(bx + 1) * blk, by, :]) - T.copy(dV[bx * blk:(bx + 1) * blk, by, :], dV_out[bx * blk:(bx + 1) * blk, by, :]) + T.annotate_layout( + { + dK: make_dq_layout(dK), + dV: make_dq_layout(dV), + } + ) + T.copy(dK[bx * blk : (bx + 1) * blk, by, :], dK_out[bx * blk : (bx + 1) * blk, by, :]) + T.copy(dV[bx * blk : (bx + 1) * blk, by, :], dV_out[bx * blk : (bx + 1) * blk, by, :]) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_atomic_add(batch, - total_q, - total_kv, - N_CTX, - heads, - max_seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_atomic_add( + batch, total_q, total_kv, N_CTX, heads, max_seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1 +): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [total_q, heads, dim_qk] k_shape = [total_kv, head_kv, dim_qk] v_shape = [total_kv, head_kv, dim_v] do_shape = [total_q, heads, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor(do_shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore - cu_seqlens_q: T.Tensor([batch + 1], "int32"), # type: ignore - cu_seqlens_k: T.Tensor([batch + 1], "int32"), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: 
T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor(do_shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + cu_seqlens_q: T.Tensor([batch + 1], T.int32), # type: ignore + cu_seqlens_k: T.Tensor([batch + 1], T.int32), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore ): - with T.Kernel( - heads, T.ceildiv(max_seq_len, block_M), batch, threads=threads) as (bx, by, bz): + with T.Kernel(heads, T.ceildiv(max_seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) dsT_shared = T.alloc_shared([block_M, block_N], dtype) q = T.alloc_shared([block_N, dim_qk], dtype) @@ -301,58 +286,45 @@ def flash_bwd( q_current_seqlen = q_end_idx - q_start_idx k_current_seqlen = k_end_idx - k_start_idx - T.annotate_layout({ - dQ: make_dq_layout(dQ), - dK: make_dq_layout(dK), - dV: make_dq_layout(dV), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - }) - - T.copy(K[k_start_idx + by * block_M:k_start_idx + (by + 1) * block_M, bx // groups, :], - K_shared) - T.copy(V[k_start_idx + by * block_M:k_start_idx + (by + 1) * block_M, bx // groups, :], - V_shared) + T.copy(K[k_start_idx + by * block_M : k_start_idx + (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[k_start_idx + by * block_M : k_start_idx + (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) - loop_st = T.min( - T.floordiv(by * block_M, block_N), T.floordiv(q_current_seqlen, - block_N)) if is_causal else 0 + loop_st = T.min(T.floordiv(by * block_M, block_N), T.floordiv(q_current_seqlen, block_N)) if is_causal else 0 loop_ed = T.ceildiv(q_current_seqlen, block_N) for k_base in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy( - Q[q_start_idx + k_base * block_N:q_start_idx + (k_base + 1) * block_N, bx, :], - q) + T.copy(Q[q_start_idx + k_base * block_N : q_start_idx + (k_base + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k_base * block_N:(k_base + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k_base * block_N : (k_base + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else((by * block_M + i <= k_base * block_N + j) and - (by * block_M + i < k_current_seqlen and - k_base * block_N + j < q_current_seqlen), - qkT[i, j], 0) + qkT[i, j] = T.if_then_else( + (by * block_M + i <= k_base * block_N + j) + and (by * block_M + i < k_current_seqlen and k_base * block_N + j < q_current_seqlen), + qkT[i, j], + 0, + ) else: for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.if_then_else( - by * block_M + i < k_current_seqlen and - k_base * block_N + j < q_current_seqlen, qkT[i, j], 0) + by * block_M + i < k_current_seqlen and k_base * block_N + j < q_current_seqlen, qkT[i, j], 0 + ) - T.copy( - dO[q_start_idx + k_base * block_N:q_start_idx + (k_base + 1) * block_N, bx, :], - do) + T.copy(dO[q_start_idx + k_base * block_N : q_start_idx + (k_base + 1) * block_N, bx, :], do) T.clear(dsT) # dsT: (block_kv, block_q) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - 
T.copy(Delta[bz, bx, k_base * block_N:(k_base + 1) * block_N], delta) + T.copy(Delta[bz, bx, k_base * block_N : (k_base + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale T.gemm(dsT_cast, q, dk, policy=T.GemmWarpPolicy.FullRow) @@ -362,49 +334,40 @@ def flash_bwd( T.gemm(dsT_shared, K_shared, dq, transpose_A=True) T.copy(dq, dq_shared) T.atomic_add( - dQ[q_start_idx + k_base * block_N:q_start_idx + k_base * block_N + block_N, - bx, :], + dQ[q_start_idx + k_base * block_N : q_start_idx + k_base * block_N + block_N, bx, :], dq_shared, memory_order="relaxed", - use_tma=True) + use_tma=True, + ) T.copy(dv, dv_shared) T.atomic_add( - dV[k_start_idx + by * block_M:k_start_idx + by * block_M + block_M, - bx // groups, :], + dV[k_start_idx + by * block_M : k_start_idx + by * block_M + block_M, bx // groups, :], dv_shared, memory_order="relaxed", - use_tma=True) + use_tma=True, + ) T.copy(dk, dk_shared) T.atomic_add( - dK[k_start_idx + by * block_M:k_start_idx + by * block_M + block_M, - bx // groups, :], + dK[k_start_idx + by * block_M : k_start_idx + by * block_M + block_M, bx // groups, :], dk_shared, memory_order="relaxed", - use_tma=True) + use_tma=True, + ) return flash_bwd -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_split(batch, - total_q, - total_kv, - N_CTX, - heads, - max_seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_split( + batch, total_q, total_kv, N_CTX, heads, max_seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1 +): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [total_q, heads, dim_qk] k_shape = [total_kv, head_kv, dim_qk] @@ -412,25 +375,24 @@ def flashattn_bwd_split(batch, do_shape = [total_q, heads, dim_v] dk_shape = [groups, total_kv, head_kv, dim_qk] # sum after kernel dv_shape = [groups, total_kv, head_kv, dim_v] # sum after kernel - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor(do_shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore - cu_seqlens_q: T.Tensor([batch + 1], "int32"), # type: ignore - cu_seqlens_k: T.Tensor([batch + 1], "int32"), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(dk_shape, dtype), # type: ignore - dV: T.Tensor(dv_shape, dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor(do_shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + cu_seqlens_q: T.Tensor([batch + 1], T.int32), # type: ignore + cu_seqlens_k: T.Tensor([batch + 1], T.int32), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(dk_shape, dtype), # type: 
ignore + dV: T.Tensor(dv_shape, dtype), # type: ignore ): - with T.Kernel( - heads, T.ceildiv(max_seq_len, block_M), batch, threads=threads) as (bx, by, bz): + with T.Kernel(heads, T.ceildiv(max_seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) dsT_shared = T.alloc_shared([block_M, block_N], dtype) q = T.alloc_shared([block_N, dim_qk], dtype) @@ -455,59 +417,52 @@ def flash_bwd( q_current_seqlen = q_end_idx - q_start_idx k_current_seqlen = k_end_idx - k_start_idx - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) - T.copy(K[k_start_idx + by * block_M:k_start_idx + (by + 1) * block_M, bx // groups, :], - K_shared) - T.copy(V[k_start_idx + by * block_M:k_start_idx + (by + 1) * block_M, bx // groups, :], - V_shared) + T.copy(K[k_start_idx + by * block_M : k_start_idx + (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[k_start_idx + by * block_M : k_start_idx + (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) - loop_st = T.min( - T.floordiv(by * block_M, block_N), T.floordiv(q_current_seqlen, - block_N)) if is_causal else 0 + loop_st = T.min(T.floordiv(by * block_M, block_N), T.floordiv(q_current_seqlen, block_N)) if is_causal else 0 loop_ed = T.ceildiv(q_current_seqlen, block_N) for k_base in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): # Note: The padding zero of varlen should be considered in T.copy - T.copy( - Q[q_start_idx + k_base * block_N:q_start_idx + (k_base + 1) * block_N, bx, :], - q) + T.copy(Q[q_start_idx + k_base * block_N : q_start_idx + (k_base + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy( - dO[q_start_idx + k_base * block_N:q_start_idx + (k_base + 1) * block_N, bx, :], - do) + T.copy(dO[q_start_idx + k_base * block_N : q_start_idx + (k_base + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k_base * block_N:(k_base + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k_base * block_N : (k_base + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else((by * block_M + i <= k_base * block_N + j) and - (by * block_M + i < k_current_seqlen and - k_base * block_N + j < q_current_seqlen), - qkT[i, j], 0) + qkT[i, j] = T.if_then_else( + (by * block_M + i <= k_base * block_N + j) + and (by * block_M + i < k_current_seqlen and k_base * block_N + j < q_current_seqlen), + qkT[i, j], + 0, + ) else: for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.if_then_else( - by * block_M + i < k_current_seqlen and - k_base * block_N + j < q_current_seqlen, qkT[i, j], 0) + by * block_M + i < k_current_seqlen and k_base * block_N + j < q_current_seqlen, qkT[i, j], 0 + ) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k_base * block_N:(k_base + 1) * block_N], delta) + T.copy(Delta[bz, bx, k_base * block_N : (k_base + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -518,62 +473,37 @@ def flash_bwd( 
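The dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale update relies on the softmax-backward identity dS = P * (dP - rowsum(dO * O)); the Delta produced by the preprocess kernel is exactly that row sum. A small float64 PyTorch check of the unscaled identity against autograd, with a single head and hypothetical sizes:

import torch

torch.manual_seed(0)
Tq, Tk, d = 8, 16, 32
s = torch.randn(Tq, Tk, dtype=torch.float64, requires_grad=True)
V = torch.randn(Tk, d, dtype=torch.float64)
dO = torch.randn(Tq, d, dtype=torch.float64)

p = torch.softmax(s, dim=-1)
o = p @ V
o.backward(dO)                               # autograd gradient w.r.t. the scores

dp = dO @ V.t()                              # dsT in the kernel, before the delta term
delta = (dO * o).sum(dim=-1, keepdim=True)   # rowsum(dO * O), the preprocess output
ds = p * (dp - delta)
assert torch.allclose(ds, s.grad)

The extra * sm_scale in the kernel then accounts for the scores having been multiplied by sm_scale before the softmax in the forward pass.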
T.gemm(dsT_shared, K_shared, dq, transpose_A=True) for i, j in T.Parallel(block_N, dim_qk): if k_base * block_N + i < q_current_seqlen: - T.atomic_add( - dQ[q_start_idx + k_base * block_N + i, bx, j], - dq[i, j], - memory_order="relaxed") + T.atomic_add(dQ[q_start_idx + k_base * block_N + i, bx, j], dq[i, j], memory_order="relaxed") T.copy(dv, dv_shared) - T.copy( - dv_shared, - dV[bx % groups, k_start_idx + by * block_M:k_start_idx + by * block_M + block_M, - bx // groups, :]) + T.copy(dv_shared, dV[bx % groups, k_start_idx + by * block_M : k_start_idx + by * block_M + block_M, bx // groups, :]) T.copy(dk, dk_shared) - T.copy( - dk_shared, - dK[bx % groups, k_start_idx + by * block_M:k_start_idx + by * block_M + block_M, - bx // groups, :]) + T.copy(dk_shared, dK[bx % groups, k_start_idx + by * block_M : k_start_idx + by * block_M + block_M, bx // groups, :]) return flash_bwd @torch.compile class _attention(torch.autograd.Function): - @staticmethod - def forward(ctx, - q, - k, - v, - seqlens_q, - seqlens_k, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - causal, - groups=1, - use_atomic=True): + def forward( + ctx, q, k, v, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, causal, groups=1, use_atomic=True + ): BATCH, N_CTX, H, D_HEAD_QK = q.shape D_HEAD_V = v.shape[-1] block_M = 128 block_N = 64 - q_unpad, indices_q, _, _ = unpad_input( - q, (torch.arange(N_CTX, device=q.device).unsqueeze(0) < seqlens_q.unsqueeze(1))) - k_unpad, indices_k, _, _ = unpad_input( - k, (torch.arange(N_CTX, device=k.device).unsqueeze(0) < seqlens_k.unsqueeze(1))) - v_unpad, _, _, _ = unpad_input( - v, (torch.arange(N_CTX, device=v.device).unsqueeze(0) < seqlens_k.unsqueeze(1))) + q_unpad, indices_q, _, _ = unpad_input(q, (torch.arange(N_CTX, device=q.device).unsqueeze(0) < seqlens_q.unsqueeze(1))) + k_unpad, indices_k, _, _ = unpad_input(k, (torch.arange(N_CTX, device=k.device).unsqueeze(0) < seqlens_k.unsqueeze(1))) + v_unpad, _, _, _ = unpad_input(v, (torch.arange(N_CTX, device=v.device).unsqueeze(0) < seqlens_k.unsqueeze(1))) total_q = q_unpad.shape[0] total_kv = k_unpad.shape[0] - mod = flashattn_fwd(BATCH, total_q, total_kv, N_CTX, H, max_seqlen_q, D_HEAD_QK, D_HEAD_V, - causal, block_M, block_N, groups) - o_unpad, lse = mod(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k) + kernel = flashattn_fwd(BATCH, total_q, total_kv, N_CTX, H, max_seqlen_q, D_HEAD_QK, D_HEAD_V, causal, block_M, block_N, groups) + o_unpad, lse = kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k) o = pad_input(o_unpad, indices_q, BATCH, N_CTX) - ctx.save_for_backward(q_unpad, k_unpad, v_unpad, o_unpad, lse, seqlens_q, seqlens_k, - cu_seqlens_q, cu_seqlens_k) + ctx.save_for_backward(q_unpad, k_unpad, v_unpad, o_unpad, lse, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k) ctx.batch = BATCH ctx.causal = causal ctx.use_atomic = use_atomic @@ -588,8 +518,7 @@ def backward(ctx, do): N_CTX = do.shape[1] q, k, v, o, lse_clone, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k = ctx.saved_tensors # lse_clone = lse.clone() - do_unpad, _, _, _ = unpad_input( - do, (torch.arange(N_CTX, device=do.device).unsqueeze(0) < seqlens_q.unsqueeze(1))) + do_unpad, _, _, _ = unpad_input(do, (torch.arange(N_CTX, device=do.device).unsqueeze(0) < seqlens_q.unsqueeze(1))) total_q, H, D_HEAD_QK = q.shape total_kv, HEAD_KV, D_HEAD_V = v.shape groups = H // HEAD_KV @@ -604,7 +533,6 @@ def maybe_contiguous(x): block_M = 128 block_N = 32 mod_prep = flashattn_bwd_preprocess(BATCH, H, total_q, N_CTX, 
ctx.max_seqlen_q, D_HEAD_V) - mod_post = flashattn_bwd_postprocess(total_q, total_kv, H, HEAD_KV, D_HEAD_QK, D_HEAD_V) delta = mod_prep(o, do, cu_seqlens_q) if ctx.use_atomic: @@ -622,12 +550,12 @@ def maybe_contiguous(x): block_N, threads=256, num_stages=2, - groups=groups) + groups=groups, + ) dq = torch.zeros_like(q, dtype=torch.float32) dk = torch.zeros_like(k, dtype=torch.float32) dv = torch.zeros_like(v, dtype=torch.float32) kernel(q, k, v, do, lse_clone, delta, cu_seqlens_q, cu_seqlens_k, dq, dk, dv) - dq, dk, dv = mod_post(dq, dk, dv) else: kernel = flashattn_bwd_split( BATCH, @@ -643,13 +571,14 @@ def maybe_contiguous(x): block_N, threads=256, num_stages=2, - groups=groups) + groups=groups, + ) + mod_post = flashattn_bwd_postprocess(total_q, total_kv, H, HEAD_KV, D_HEAD_QK, D_HEAD_V) dq = torch.zeros_like(q, dtype=torch.float32) dk = torch.empty(groups, *k.shape, dtype=torch.float16, device=q.device) dv = torch.empty(groups, *v.shape, dtype=torch.float16, device=q.device) kernel(q, k, v, do, lse_clone, delta, cu_seqlens_q, cu_seqlens_k, dq, dk, dv) - dq, _, _ = mod_post(dq, torch.zeros_like(k, dtype=torch.float32), - torch.zeros_like(v, dtype=torch.float32)) + dq, _, _ = mod_post(dq, torch.zeros_like(k, dtype=torch.float32), torch.zeros_like(v, dtype=torch.float32)) dk, dv = dk.sum(0), dv.sum(0) dq = pad_input(dq, ctx.indices_q, BATCH, N_CTX) @@ -668,15 +597,13 @@ def ref_program(Q, K, V, padding_mask, is_causal, groups=1): # HQ = HKV * groups # To handle precision issue Q, K, V = Q.float(), K.float(), V.float() - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim_qk = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim_qk, dtype=scores.dtype)) if padding_mask is not None: scores.masked_fill_(rearrange(~padding_mask, "b s -> b 1 1 s"), float("-inf")) @@ -684,41 +611,35 @@ def ref_program(Q, K, V, padding_mask, is_causal, groups=1): seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) if padding_mask is not None: output.masked_fill_(rearrange(~padding_mask, "b s -> b s 1 1"), 0.0) return output -def main(BATCH: int = 1, - H: int = 32, - N_CTX: int = 256, - D_HEAD_QK: int = 192, - D_HEAD_V: int = 128, - groups: int = 16, - causal: bool = False, - use_atomic: bool = True): +def main( + BATCH: int = 1, + H: int = 32, + N_CTX: int = 256, + D_HEAD_QK: int = 192, + D_HEAD_V: int = 128, + groups: int = 16, + causal: bool = False, + use_atomic: bool = True, +): flops_per_qk = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_QK flops_per_v = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_V total_flops = 3 * flops_per_qk + 2 * flops_per_v if causal: total_flops *= 0.5 - Q = ( - 
torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() head_kv = H // groups - K = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - V = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - dO = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() padding_mask = generate_random_padding_mask(N_CTX, BATCH, "cuda", mode="random") seqlens_q = padding_mask.sum(dim=-1, dtype=torch.int32) cu_seqlens_q = F.pad(torch.cumsum(seqlens_q, dim=0, dtype=torch.int32), (1, 0)) @@ -727,8 +648,7 @@ def main(BATCH: int = 1, # In training backward pass, seqlens_k should be the same as seqlens_q seqlens_k, cu_seqlens_k, max_seqlen_k = seqlens_q, cu_seqlens_q, max_seqlen_q - O = attention(Q, K, V, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, - max_seqlen_k, causal, groups, use_atomic) + O = attention(Q, K, V, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, causal, groups, use_atomic) O.backward(dO, retain_graph=True) dQ, Q.grad = Q.grad.clone(), None dK, K.grad = K.grad.clone(), None @@ -765,22 +685,72 @@ def run1(): ) +def run_regression_perf(): + BATCH = 1 + H = 32 + N_CTX = 256 + D_HEAD_QK = 192 + D_HEAD_V = 128 + groups = 16 + causal = False + device = "cuda" + torch.manual_seed(42) + total_q = BATCH * N_CTX + total_kv = BATCH * N_CTX + head_kv = H // groups + Q = torch.randn(total_q, H, D_HEAD_QK, device=device, dtype=torch.half) + K = torch.randn(total_kv, head_kv, D_HEAD_QK, device=device, dtype=torch.half) + V = torch.randn(total_kv, head_kv, D_HEAD_V, device=device, dtype=torch.half) + O = torch.randn(total_q, H, D_HEAD_V, device=device, dtype=torch.half) + dO = torch.randn(total_q, H, D_HEAD_V, device=device, dtype=torch.half) + cu_seqlens_q = torch.arange(0, (BATCH + 1) * N_CTX, N_CTX, device=device, dtype=torch.int32) + cu_seqlens_k = cu_seqlens_q + max_seqlen_q = N_CTX + lse = torch.zeros(BATCH, H, N_CTX, device=device, dtype=torch.float32) + with torch.no_grad(): + mod_prep = flashattn_bwd_preprocess(BATCH, H, total_q, N_CTX, max_seqlen_q, D_HEAD_V) + kernel = flashattn_bwd_split( + BATCH, + total_q, + total_kv, + N_CTX, + H, + max_seqlen_q, + D_HEAD_QK, + D_HEAD_V, + causal, + block_M=128, + block_N=32, + threads=256, + num_stages=2, + groups=groups, + ) + dQ = torch.zeros_like(Q, dtype=torch.float32) + dK = torch.zeros(groups, total_kv, head_kv, D_HEAD_QK, device=device, dtype=torch.float16) + dV = torch.zeros(groups, total_kv, head_kv, D_HEAD_V, device=device, dtype=torch.float16) + Delta = mod_prep(O, dO, cu_seqlens_q) + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(Q, K, V, dO, lse, Delta, cu_seqlens_q, cu_seqlens_k, dQ, dK, dV) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": arch = nvcc.get_target_compute_version() print(f"Detected GPU compute capability: {arch}") assert float(arch) >= 9.0, "This example only supports GPU with 
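For the varlen path, cu_seqlens_q prepends a zero to the cumulative sum of the per-sequence lengths, so batch element b owns rows cu_seqlens[b]:cu_seqlens[b + 1] of the packed tensors produced by unpad_input, and max_seqlen sizes the T.ceildiv(max_seq_len, block_M) grid. A short illustrative sketch with made-up lengths (not values from this example):

import torch
import torch.nn.functional as F

seqlens = torch.tensor([5, 3, 7], dtype=torch.int32)   # hypothetical lengths, batch of 3

# Leading zero + cumulative sum: sequence b spans rows cu_seqlens[b] : cu_seqlens[b + 1].
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
print(cu_seqlens.tolist())              # [0, 5, 8, 15]

total = int(cu_seqlens[-1])             # 15 packed rows, i.e. total_q / total_kv
max_seqlen = int(seqlens.max())         # 7

x_packed = torch.randn(total, 4)        # stand-in for a packed [total, H, D] tensor
start, end = cu_seqlens[1].item(), cu_seqlens[2].item()
assert x_packed[start:end].shape[0] == int(seqlens[1])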
compute capability >= 9.0" parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head_qk', type=int, default=192, help='Head dimension for Q/K') - parser.add_argument('--d_head_v', type=int, default=128, help='Head dimension for V') - parser.add_argument('--causal', action='store_true', help='Causal flag') - parser.add_argument('--groups', type=int, default=16, help='groups') - parser.add_argument( - '--use_atomic', action='store_true', default=False, help='Use atomic add for dK/dV') - parser.add_argument( - '--use_split', action='store_true', default=False, help='Use split for dK/dV') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head_qk", type=int, default=192, help="Head dimension for Q/K") + parser.add_argument("--d_head_v", type=int, default=128, help="Head dimension for V") + parser.add_argument("--causal", action="store_true", help="Causal flag") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--use_atomic", action="store_true", default=False, help="Use atomic add for dK/dV") + parser.add_argument("--use_split", action="store_true", default=False, help="Use split for dK/dV") args = parser.parse_args() # Can be set to True/False for testing args.causal = True @@ -794,5 +764,4 @@ def run1(): # Default: use atomic use_atomic = True - main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, - use_atomic) + main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, use_atomic) diff --git a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py index ed07e7d9d..2da64472c 100644 --- a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py +++ b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py @@ -6,25 +6,27 @@ @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, groups=1): - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=256) as (bx, by, 
bz): Q_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -39,26 +41,25 @@ def flash_fwd( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by // groups, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): @@ -72,29 +73,31 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim_v): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim_v] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -103,50 +106,42 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim_v, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + 
T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd(batch, - heads, - seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -167,45 +162,30 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim_v], accum_dtype) dq_shared = T.alloc_shared([block_N, dim_qk], accum_dtype) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dq_shared: tilelang.layout.make_swizzled_layout(dq_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - }) - - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) - T.gemm( - K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) - T.copy(lse[bz, bx, k * block_N:(k + 1) * 
block_N], lse_shared) + T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) - T.gemm( - V_shared, - do, - dsT, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow, - wg_wait=-1) + T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) T.wait_wgmma(1) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -217,18 +197,17 @@ def flash_bwd( T.gemm(dsT_shared, K_shared, dq, transpose_A=True, wg_wait=1) T.wait_wgmma(0) T.copy(dq, dq_shared) - T.atomic_add(dQ[bz, k * block_N:(k + 1) * block_N, bx, :], dq_shared) + T.atomic_add(dQ[bz, k * block_N : (k + 1) * block_N, bx, :], dq_shared) T.copy(dv, dv_shared) - T.atomic_add(dV[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dv_shared) + T.atomic_add(dV[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dv_shared) T.copy(dk, dk_shared) - T.atomic_add(dK[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dk_shared) + T.atomic_add(dK[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dk_shared) return flash_bwd @torch.compile class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal, groups=1, use_atomic=True): BATCH, N_CTX, H, D_HEAD_QK = q.shape @@ -246,7 +225,10 @@ def forward(ctx, q, k, v, causal, groups=1, use_atomic=True): def backward(ctx, do): q, k, v, o, lse = ctx.saved_tensors BATCH, N_CTX, H, D_HEAD_QK = q.shape - HEAD_KV, D_HEAD_V, = v.shape[-2], v.shape[-1] + ( + HEAD_KV, + D_HEAD_V, + ) = v.shape[-2], v.shape[-1] groups = H // HEAD_KV def maybe_contiguous(x): @@ -260,18 +242,7 @@ def maybe_contiguous(x): mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD_V) delta = mod_prep(o, do) - kernel = flashattn_bwd( - BATCH, - H, - N_CTX, - D_HEAD_QK, - D_HEAD_V, - ctx.causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=groups) + kernel = flashattn_bwd(BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k = [BATCH, N_CTX, HEAD_KV, D_HEAD_QK] shape_v = [BATCH, N_CTX, HEAD_KV, D_HEAD_V] @@ -294,52 +265,36 @@ def ref_program(Q, K, V, is_causal, groups=1): # K: [B, T, HK, D_QK] # V: [B, T, HV, D_V] # HQ = HKV * groups - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim_qk = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = 
V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim_qk, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output -def main(BATCH: int = 1, - H: int = 32, - N_CTX: int = 256, - D_HEAD_QK: int = 192, - D_HEAD_V: int = 128, - groups: int = 16, - causal: bool = False): +def main(BATCH: int = 1, H: int = 32, N_CTX: int = 256, D_HEAD_QK: int = 192, D_HEAD_V: int = 128, groups: int = 16, causal: bool = False): flops_per_qk = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_QK flops_per_v = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_V total_flops = 3 * flops_per_qk + 2 * flops_per_v if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() head_kv = H // groups - K = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - V = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - dO = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() O = attention(Q, K, V, causal, groups) O.backward(dO, retain_graph=True) dQ, Q.grad = Q.grad.clone(), None @@ -356,7 +311,7 @@ def main(BATCH: int = 1, torch.testing.assert_close(dV, dV_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dK, dK_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dQ, dQ_ref, rtol=1e-2, atol=1e-2) - print('All checks passed.✅') + print("All checks passed.✅") def run(): O_ref.backward(dO, retain_graph=True) @@ -374,15 +329,34 @@ def run1(): print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf( + BATCH: int = 1, H: int = 32, N_CTX: int = 256, D_HEAD_QK: int = 192, D_HEAD_V: int = 128, groups: int = 16, causal: bool = False +): + Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + + head_kv = H // groups + K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + O = attention(Q, K, V, causal, groups) + + def run1(): + O.backward(dO, retain_graph=True) + + from tilelang.profiler import do_bench + + return do_bench(run1, warmup=500, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch 
size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head_qk', type=int, default=192, help='Head dimension for Q/K') - parser.add_argument('--d_head_v', type=int, default=128, help='Head dimension for V') - parser.add_argument('--causal', action='store_true', help='Causal flag') - parser.add_argument('--groups', type=int, default=16, help='groups') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head_qk", type=int, default=192, help="Head dimension for Q/K") + parser.add_argument("--d_head_v", type=int, default=128, help="Head dimension for V") + parser.add_argument("--causal", action="store_true", help="Causal flag") + parser.add_argument("--groups", type=int, default=16, help="groups") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal) diff --git a/examples/flash_attention/example_gqa_fwd_bshd.py b/examples/flash_attention/example_gqa_fwd_bshd.py index 4d9d06a4f..e884a8158 100644 --- a/examples/flash_attention/example_gqa_fwd_bshd.py +++ b/examples/flash_attention/example_gqa_fwd_bshd.py @@ -9,7 +9,6 @@ class FlashAttentionTuneSpace: - def __init__( self, block_sizes=(64, 128, 256), @@ -40,7 +39,7 @@ def get_configs(user_config=None): warp_M = block_M // warp_count warp_N = block_N // warp_count - if (warp_M % config.warp_alignment != 0 or warp_N % config.warp_alignment != 0): + if warp_M % config.warp_alignment != 0 or warp_N % config.warp_alignment != 0: continue shared_mem = 2 * config.dtype_bytes * config.dim * (block_M + block_N) @@ -48,114 +47,38 @@ def get_configs(user_config=None): continue for num_stages in config.num_stages_range: - valid_configs.append({ - "block_M": block_M, - "block_N": block_N, - "num_stages": num_stages, - "threads": threads, - }) + valid_configs.append( + { + "block_M": block_M, + "block_N": block_N, + "num_stages": num_stages, + "threads": threads, + } + ) return valid_configs @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_len, - dim, - is_causal, - groups=1, - block_M=64, - block_N=64, - num_stages=0, - threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_len, dim, is_causal, groups=1, block_M=64, block_N=64, num_stages=0, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] - dtype = "float16" - accum_dtype = "float" - - @T.macro - def MMA0( - K: T.Tensor(kv_shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by // groups, :], K_shared) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, 
policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(kv_shape, dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. - # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -171,25 +94,49 @@ def main( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + T.copy(K[bz, k * 
block_N : (k + 1) * block_N, by // groups, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(O_shared, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) return main @@ -199,50 +146,34 @@ def ref_program(Q, K, V, is_causal, groups=1): # K: [B, T, HK, D] # V: [B, T, HV, D] # HQ = HKV * groups - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output -def main(batch: int = 1, - heads: int = 64, - seq_len: int = 4096, - dim: int = 128, - is_causal: bool = False, - groups: int = 16, - tune: bool = False): +def main( + batch: int = 1, heads: int = 64, seq_len: int = 4096, dim: int = 128, is_causal: bool = False, groups: int = 16, tune: bool = False +): flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim total_flops = 2 * flops_per_matmul if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_len, - dim, - is_causal, - groups=groups, - block_M=64, - block_N=64, - num_stages=2, - threads=128) + if not tune: + kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=64, block_N=64, num_stages=2, threads=128) 
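
Editor's aside: the hunk above inlines the former MMA0/Softmax/Rescale/MMA1 macros into the kernel body, but the recurrence it computes is unchanged: a streaming (online) softmax that keeps a running row maximum and running denominator, rescales the partial output whenever the maximum grows, and uses exp2 with a baked-in log2(e) factor so exp((x - m)/sqrt(d)) becomes a single exp2 of an FMA. The sketch below is a plain PyTorch reimplementation of that recurrence for one (batch, head) slice, checked against a direct softmax; the names (q, k, v, block_n) and the CPU/float32 setup are chosen for illustration only and are not the kernel's API.

import torch

def online_softmax_attention(q, k, v, block_n=64):
    """Streaming softmax-attention for one (batch, head) slice; q: [M, d], k/v: [N, d]."""
    M, d = q.shape
    scale = (1.0 / d) ** 0.5 * 1.44269504  # 1/sqrt(d) * log2(e), matching the kernel's exp2 trick
    acc_o = torch.zeros(M, v.shape[1])
    logsum = torch.zeros(M)                       # running softmax denominator
    row_max = torch.full((M,), float("-inf"))     # running max of the raw QK^T scores
    for start in range(0, k.shape[0], block_n):
        s = q @ k[start:start + block_n].T        # raw scores for this key block
        prev_max = row_max
        row_max = torch.maximum(prev_max, s.max(dim=1).values)
        # exp2(x*scale - m*scale) == exp((x - m)/sqrt(d)); rescale is 0 on the first block
        p = torch.exp2(s * scale - (row_max * scale).unsqueeze(1))
        rescale = torch.exp2(prev_max * scale - row_max * scale)
        logsum = logsum * rescale + p.sum(dim=1)
        acc_o = acc_o * rescale.unsqueeze(1) + p @ v[start:start + block_n]
    return acc_o / logsum.unsqueeze(1)

torch.manual_seed(0)
q, k, v = (torch.randn(128, 64) for _ in range(3))
ref = torch.softmax(q @ k.T / 64 ** 0.5, dim=-1) @ v
torch.testing.assert_close(online_softmax_attention(q, k, v), ref, rtol=1e-4, atol=1e-4)

The non-causal masking added in the hunk (initializing out-of-bounds key columns to -infinity before the GEMM) plays the same role as limiting the loop to k.shape[0] here: padded columns contribute exp2(-inf) = 0 to both the numerator and the denominator.
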
ref_program_processed = partial(ref_program, is_causal=is_causal, groups=groups) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) profiler.assert_allclose(ref_program_processed, rtol=0.01, atol=0.01) @@ -264,14 +195,22 @@ def main(batch: int = 1, print(f"Ref latency: {ref_latency}") +def run_regression_perf( + batch: int = 1, heads: int = 64, seq_len: int = 4096, dim: int = 128, is_causal: bool = False, groups: int = 16, tune: bool = False +): + kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=64, block_N=64, num_stages=2, threads=128) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='heads') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') - parser.add_argument('--groups', type=int, default=16, help='groups') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="heads") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--tune", action="store_true", help="tune configs") + parser.add_argument("--groups", type=int, default=16, help="groups") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups, args.tune) diff --git a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py index 1c1fc12d2..73a725d9f 100644 --- a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py +++ b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py @@ -24,9 +24,11 @@ def get_configs(): rep=10, ) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn( batch, heads, @@ -39,90 +41,19 @@ def flashattn( num_stages=0, threads=128, ): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] - dtype = "float16" - accum_dtype = "float" - - @T.macro - def MMA0( - K: T.Tensor(kv_shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by // groups, :], K_shared) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(kv_shape, dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s_cast: 
T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. - # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -138,30 +69,55 @@ def main( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined( - loop_range, - num_stages=num_stages, - order=[-1, 0, 3, 1, -1, 2], - stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + loop_range, + num_stages=num_stages, + 
order=[-1, 0, 3, 1, -1, 2], + stage=[-1, 0, 0, 1, -1, 1], + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]], + ): + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(O_shared, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) return main @@ -171,23 +127,21 @@ def ref_program(Q, K, V, is_causal, groups=1): # K: [B, T, HK, D] # V: [B, T, HV, D] # HQ = HKV * groups - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -205,18 +159,8 @@ def main( if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_len, - dim, - is_causal, - groups=groups, - block_M=128, - block_N=128, - num_stages=2, - threads=256) + if not tune: + kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=128, block_N=128, num_stages=2, threads=256) ref_program_processed = partial(ref_program, is_causal=is_causal, groups=groups) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) profiler.assert_allclose(ref_program_processed, rtol=0.01, 
atol=0.01) @@ -238,14 +182,28 @@ def main( print(f"Ref latency: {ref_latency}") +def run_regression_perf( + batch: int = 1, + heads: int = 64, + seq_len: int = 4096, + dim: int = 128, + is_causal: bool = False, + groups: int = 16, +): + kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=128, block_N=128, num_stages=2, threads=256) + + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='heads') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') - parser.add_argument('--groups', type=int, default=16, help='groups') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="heads") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--tune", action="store_true", help="tune configs") + parser.add_argument("--groups", type=int, default=16, help="groups") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups, args.tune) diff --git a/examples/flash_attention/example_gqa_fwd_varlen.py b/examples/flash_attention/example_gqa_fwd_varlen.py index db16e1586..0e8e21c43 100644 --- a/examples/flash_attention/example_gqa_fwd_varlen.py +++ b/examples/flash_attention/example_gqa_fwd_varlen.py @@ -4,91 +4,36 @@ import tilelang import tilelang.language as T import tilelang.testing -from einops import rearrange, repeat from tilelang.profiler import do_bench from varlen_utils import generate_random_padding_mask, generate_qkv -def attention_ref( - q, - k, - v, - query_padding_mask=None, - key_padding_mask=None, - causal=False, - window_size=(-1, -1), - upcast=True, -): - if causal: - window_size = (window_size[0], 0) - dtype_og = q.dtype - if upcast: - q, k, v = q.float(), k.float(), v.float() - b, T, Hq, D = q.shape - S = k.shape[1] - scale = (1.0 / D)**0.5 - k = repeat(k, "b s h d -> b s (h g) d", g=Hq // k.shape[2]) - v = repeat(v, "b s h d -> b s (h g) d", g=Hq // v.shape[2]) - scores = torch.einsum("bthd,bshd->bhts", q, k) - left, right = window_size - left = S if left is None or left < 0 else int(left) - right = S if right is None or right < 0 else int(right) - t_idx = torch.arange(T, device=scores.device)[:, None] - s_idx = torch.arange(S, device=scores.device)[None, :] - visible_ts = (s_idx >= (t_idx - left)) & (s_idx <= (t_idx + right)) - visible_mask = visible_ts.unsqueeze(0).unsqueeze(0) - if key_padding_mask is not None: - k_keep = rearrange(key_padding_mask, "b s -> b 1 1 s") - visible_mask = visible_mask & k_keep - neg_inf = torch.finfo(scores.dtype).min - scores = scores * scale - scores = scores.masked_fill(~visible_mask, neg_inf) - attention = torch.softmax(scores, dim=-1).to(v.dtype) - if query_padding_mask is not None: - q_keep = rearrange(query_padding_mask, "b t -> b 1 t 1") - attention = attention.masked_fill(~q_keep, 0.0) - output = 
torch.einsum("bhts,bshd->bthd", attention, v) - if query_padding_mask is not None: - output = output.masked_fill(rearrange(~query_padding_mask, "b t -> b t 1 1"), 0.0) - return output.to(dtype=dtype_og), attention.to(dtype=dtype_og) - - @tilelang.jit( - out_idx=[6], pass_configs={ + out_idx=[6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch_size, - groups, - UQ, - UKV, - heads, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=1, - threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch_size, groups, UQ, UKV, heads, dim, is_causal, block_M=64, block_N=64, num_stages=1, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [UQ, heads, dim] kv_shape = [UKV, head_kv, dim] o_shape = [UQ, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - Q_unpad: T.Tensor(q_shape, dtype), - K_unpad: T.Tensor(kv_shape, dtype), - V_unpad: T.Tensor(kv_shape, dtype), - cu_seqlens_q: T.Tensor([batch_size + 1], "int32"), - cu_seqlens_k: T.Tensor([batch_size + 1], "int32"), - max_seqlen_q: T.int32, - Output_unpad: T.Tensor(o_shape, dtype), + Q_unpad: T.Tensor(q_shape, dtype), + K_unpad: T.Tensor(kv_shape, dtype), + V_unpad: T.Tensor(kv_shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + cu_seqlens_k: T.Tensor([batch_size + 1], T.int32), + max_seqlen_q: T.int32, + Output_unpad: T.Tensor(o_shape, dtype), ): - with T.Kernel( - T.ceildiv(max_seqlen_q, block_M), heads, batch_size, - threads=threads) as (bx, by, bz): + with T.Kernel(T.ceildiv(max_seqlen_q, block_M), heads, batch_size, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -102,11 +47,6 @@ def main( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - }) - batch_idx = bz head_idx = by kv_head_idx = head_idx // groups @@ -119,43 +59,42 @@ def main( q_current_seqlen = q_end_idx - q_start_idx kv_current_seqlen = k_end_idx - kv_start_idx - T.copy( - Q_unpad[q_start_idx + bx * block_M:q_start_idx + (bx + 1) * block_M, head_idx, :], - Q_shared) + T.copy(Q_unpad[q_start_idx + bx * block_M : q_start_idx + (bx + 1) * block_M, head_idx, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) + offset = kv_current_seqlen - q_current_seqlen # always align on the right + max_visible_k_idx = offset + (bx + 1) * block_M loop_range = ( - T.min( - T.ceildiv(q_current_seqlen + - (bx + 1) * block_M, block_N), T.ceildiv(kv_current_seqlen, block_N)) - if is_causal else T.ceildiv(kv_current_seqlen, block_N)) + T.min(T.ceildiv(max_visible_k_idx, block_N), T.ceildiv(kv_current_seqlen, block_N)) + if is_causal + else T.ceildiv(kv_current_seqlen, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): - T.copy( - K_unpad[kv_start_idx + k * block_N:kv_start_idx + (k + 1) * block_N, - kv_head_idx, :], K_shared) + T.copy(K_unpad[kv_start_idx + k * block_N : kv_start_idx + (k + 1) * block_N, kv_head_idx, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, - j] = T.if_then_else((bx * block_M + i < k * block_N + j) or - (bx * block_M + i >= q_current_seqlen 
or - k * block_N + j >= kv_current_seqlen), -1e9, 0) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i + offset < k * block_N + j) + or (bx * block_M + i >= q_current_seqlen or k * block_N + j >= kv_current_seqlen), + -1e9, + 0, + ) else: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= q_current_seqlen or - k * block_N + j >= kv_current_seqlen), -1e9, - 0) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= q_current_seqlen or k * block_N + j >= kv_current_seqlen), -1e9, 0 + ) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) - for i in T.Parallel(block_M): scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) @@ -171,16 +110,15 @@ def main( for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] - T.copy( - V_unpad[kv_start_idx + k * block_N:kv_start_idx + (k + 1) * block_N, - kv_head_idx, :], V_shared) + T.copy(V_unpad[kv_start_idx + k * block_N : kv_start_idx + (k + 1) * block_N, kv_head_idx, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_M, dim): - acc_o[i, j] /= logsum[i] - T.copy(acc_o, O_shared) + # When sq > skv, some tokens can see nothing + acc_o[i, j] = 0 if is_causal and bx * block_M + i + offset < 0 else acc_o[i, j] / logsum[i] + T.copy(acc_o, O_shared) for i, d in T.Parallel(block_M, dim): if bx * block_M + i < q_current_seqlen: Output_unpad[q_start_idx + bx * block_M + i, head_idx, d] = O_shared[i, d] @@ -188,13 +126,9 @@ def main( return main -def main(batch: int = 1, - heads: int = 64, - q_seqlen: int = 2048, - k_seqlen: int = 2048, - dim: int = 128, - groups: int = 16, - is_causal: bool = False): +def main( + batch: int = 1, heads: int = 64, q_seqlen: int = 2048, k_seqlen: int = 2048, dim: int = 128, groups: int = 16, is_causal: bool = False +): assert heads % groups == 0, "heads must be divisible by groups" flops_per_matmul = 2.0 * batch * heads * q_seqlen * k_seqlen * dim @@ -232,55 +166,46 @@ def main(batch: int = 1, output_pad_fn, _, _, - ) = generate_qkv( - q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) UQ = q_unpad.shape[0] UKV = k_unpad.shape[0] - kernel = flashattn( - batch, - groups, - UQ, - UKV, - heads, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=2, - threads=256) + kernel = flashattn(batch, groups, UQ, UKV, heads, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256) out_unpad = kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q) out = output_pad_fn(out_unpad) - out_ref, _ = attention_ref( - q, - k, - v, - query_padding_mask=query_padding_mask, - key_padding_mask=key_padding_mask, + import flash_attn + + fa_out_unpad = flash_attn.flash_attn_varlen_func( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + 0.0, causal=is_causal, ) - torch.testing.assert_close(out, out_ref, rtol=1e-2, atol=1e-2) + fa_out = output_pad_fn(fa_out_unpad) + torch.testing.assert_close(out, fa_out, rtol=1e-2, atol=1e-2) + print("All checks passed.✅") - latency = do_bench( - lambda: kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q), - _n_warmup=5, - _n_repeat=5) + latency = do_bench(lambda: kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q), 
_n_warmup=5, _n_repeat=5) print("Tile-lang: {:.2f} ms".format(latency)) print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='query heads') - parser.add_argument('--groups', type=int, default=16, help='groups') - parser.add_argument('--q_seqlen', type=int, default=2048, help='query sequence length') - parser.add_argument('--k_seqlen', type=int, default=2048, help='key/value sequence length') - parser.add_argument('--dim', type=int, default=128, help='head dim') - parser.add_argument('--is_causal', action='store_true', help='causal attention') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="query heads") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--q_seqlen", type=int, default=2048, help="query sequence length") + parser.add_argument("--k_seqlen", type=int, default=2048, help="key/value sequence length") + parser.add_argument("--dim", type=int, default=128, help="head dim") + parser.add_argument("--is_causal", action="store_true", help="causal attention") args = parser.parse_args() - main(args.batch, args.heads, args.q_seqlen, args.k_seqlen, args.dim, args.groups, - args.is_causal) + main(args.batch, args.heads, args.q_seqlen, args.k_seqlen, args.dim, args.groups, args.is_causal) diff --git a/examples/flash_attention/example_mha_bwd_bhsd.py b/examples/flash_attention/example_mha_bwd_bhsd.py index 1595ae764..34e8fefc5 100644 --- a/examples/flash_attention/example_mha_bwd_bhsd.py +++ b/examples/flash_attention/example_mha_bwd_bhsd.py @@ -7,22 +7,24 @@ @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - Output: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + Output: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=128) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -38,29 +40,28 @@ def flash_fwd( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) # T.copy(Q_shared, Q_local) # for i, j in T.Parallel(block_M, dim): # Q_local[i, j] *= scale - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else 
T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -74,29 +75,31 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(acc_o, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -105,68 +108,71 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], do) + T.copy(O[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep def make_dq_layout(dQ): # atomicAdd can not be vectorized, so we need to reorder dq to match the 8x8 gemm fragment - return T.Layout(dQ.shape, - lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) + return T.Layout(dQ.shape, lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) 
+ (d % 8) // 2]) @tilelang.jit( - out_idx=[1], pass_configs={ + out_idx=[1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_postprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 64 @T.prim_func def flash_bwd_post( - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(shape, dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) T.copy( - dQ[bz, by, bx * blk:(bx + 1) * blk, :], - dQ_out[bz, by, bx * blk:(bx + 1) * blk, :], + dQ[bz, by, bx * blk : (bx + 1) * blk, :], + dQ_out[bz, by, bx * blk : (bx + 1) * blk, :], ) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - sm_scale = (1.0 / dim)**0.5 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + sm_scale = (1.0 / dim) ** 0.5 + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dK: T.Tensor(shape, dtype), # type: ignore - dV: T.Tensor(shape, dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dK: T.Tensor(shape, dtype), # type: ignore + dV: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=128) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim], dtype) @@ -190,36 +196,36 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim], dtype) dk_shared = T.alloc_shared([block_M, dim], dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) - T.copy(K[bz, bx, by * block_M:(by + 1) * block_M, :], K_shared) - T.copy(V[bz, bx, by * block_M:(by + 1) * block_M, :], V_shared) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) + T.copy(K[bz, bx, by * block_M : (by + 1) * block_M, :], K_shared) + T.copy(V[bz, bx, by * block_M : (by + 1) * block_M, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=2): - T.copy(Q[bz, bx, k * block_N:(k + 1) * block_N, :], q) + T.copy(Q[bz, bx, k * block_N : (k + 1) * block_N, :], 
q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, bx, k * block_N:(k + 1) * block_N, :], do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + # We don't need to handle OOB positions for non-causal cases, + # since OOB values won't affect other positions here. + T.copy(dO[bz, bx, k * block_N : (k + 1) * block_N, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -232,14 +238,13 @@ def flash_bwd( T.atomic_add(dQ[bz, bx, k * block_N + i, j], dq[i, j]) T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, dV[bz, bx, by * block_M:(by + 1) * block_M, :]) - T.copy(dk_shared, dK[bz, bx, by * block_M:(by + 1) * block_M, :]) + T.copy(dv_shared, dV[bz, bx, by * block_M : (by + 1) * block_M, :]) + T.copy(dk_shared, dK[bz, bx, by * block_M : (by + 1) * block_M, :]) return flash_bwd class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal): BATCH, H, N_CTX, D_HEAD = q.shape @@ -281,15 +286,15 @@ def maybe_contiguous(x): def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bhqd,bhkd->bhqk', Q, K) + scores = torch.einsum("bhqd,bhkd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(2) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bhkd->bhqd', attention_weights, V) + output = torch.einsum("bhqk,bhkd->bhqd", attention_weights, V) return output @@ -304,9 +309,7 @@ def main( total_flops = 5 * flops_per_matmul if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, H, N_CTX, D_HEAD, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, H, N_CTX, D_HEAD, dtype=torch.half, device="cuda").normal_().requires_grad_() K = torch.empty_like(Q).normal_().requires_grad_() V = torch.empty_like(Q).normal_().requires_grad_() dO = torch.randn_like(Q) @@ -345,12 +348,43 @@ def run1(): print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf(): + BATCH = 1 + H = 16 + N_CTX = 512 + D_HEAD = 64 + causal = False + device = "cuda" + torch.manual_seed(0) + block_M = 64 + block_N = 64 if D_HEAD <= 64 else 32 + Q = torch.randn(BATCH, H, N_CTX, D_HEAD, device=device, dtype=torch.half) + K = torch.randn_like(Q) + V = torch.randn_like(Q) + O = torch.randn_like(Q) + dO = torch.randn_like(Q) + lse = torch.zeros(BATCH, H, N_CTX, device=device, dtype=torch.float32) + with torch.no_grad(): + mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD) + kernel = flashattn_bwd(BATCH, H, 
N_CTX, D_HEAD, causal, block_M, block_N) + dQ = torch.zeros(BATCH, H, N_CTX, D_HEAD, device=device, dtype=torch.float32) + dK = torch.zeros(BATCH, H, N_CTX, D_HEAD, device=device, dtype=torch.float16) + dV = torch.zeros(BATCH, H, N_CTX, D_HEAD, device=device, dtype=torch.float16) + Delta = mod_prep(O, dO) + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(Q, K, V, dO, lse, Delta, dQ, dK, dV) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head', type=int, default=64, help='Head dimension') - parser.add_argument('--causal', type=bool, default=False, help='Causal flag') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head", type=int, default=64, help="Head dimension") + parser.add_argument("--causal", type=bool, default=False, help="Causal flag") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, args.causal) diff --git a/examples/flash_attention/example_mha_bwd.py b/examples/flash_attention/example_mha_bwd_bshd.py similarity index 65% rename from examples/flash_attention/example_mha_bwd.py rename to examples/flash_attention/example_mha_bwd_bshd.py index 543c2c0e7..fc8328fa4 100644 --- a/examples/flash_attention/example_mha_bwd.py +++ b/examples/flash_attention/example_mha_bwd_bshd.py @@ -7,22 +7,24 @@ @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - Output: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + Output: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=128) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -38,25 +40,25 @@ def flash_fwd( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * 
block_N:(k + 1) * block_N, by, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -70,29 +72,31 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -101,68 +105,71 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep def make_dq_layout(dQ): # atomicAdd can not be vectorized, so we need to reorder dq to match the 8x8 gemm fragment - return T.Layout(dQ.shape, - lambda b, l, h, d: [b, l // 8, h, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) + return T.Layout(dQ.shape, lambda b, l, h, d: [b, l // 8, h, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) @tilelang.jit( - out_idx=[1], pass_configs={ + out_idx=[1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_postprocess(batch, 
heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim] blk = 64 @T.prim_func def flash_bwd_post( - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(shape, dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) T.copy( - dQ[bz, bx * blk:(bx + 1) * blk, by, :], - dQ_out[bz, bx * blk:(bx + 1) * blk, by, :], + dQ[bz, bx * blk : (bx + 1) * blk, by, :], + dQ_out[bz, bx * blk : (bx + 1) * blk, by, :], ) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - sm_scale = (1.0 / dim)**0.5 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + sm_scale = (1.0 / dim) ** 0.5 + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dK: T.Tensor(shape, dtype), # type: ignore - dV: T.Tensor(shape, dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dK: T.Tensor(shape, dtype), # type: ignore + dV: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=128) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim], dtype) @@ -186,33 +193,36 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim], dtype) dk_shared = T.alloc_shared([block_M, dim], dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - }) - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx, :], V_shared) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=2): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - 
qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + # We don't need to handle OOB positions for non-causal cases, + # since OOB values won't affect other positions here. + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -225,14 +235,13 @@ def flash_bwd( T.atomic_add(dQ[bz, k * block_N + i, bx, j], dq[i, j]) T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, dV[bz, by * block_M:(by + 1) * block_M, bx, :]) - T.copy(dk_shared, dK[bz, by * block_M:(by + 1) * block_M, bx, :]) + T.copy(dv_shared, dV[bz, by * block_M : (by + 1) * block_M, bx, :]) + T.copy(dk_shared, dK[bz, by * block_M : (by + 1) * block_M, bx, :]) return flash_bwd class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal): BATCH, N_CTX, H, D_HEAD = q.shape @@ -274,15 +283,15 @@ def maybe_contiguous(x): def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -297,9 +306,7 @@ def main( total_flops = 5 * flops_per_matmul if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD, dtype=torch.half, device="cuda").normal_().requires_grad_() K = torch.empty_like(Q).normal_().requires_grad_() V = torch.empty_like(Q).normal_().requires_grad_() dO = torch.randn_like(Q) @@ -336,12 +343,43 @@ def run1(): print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf(): + BATCH = 1 + H = 16 + N_CTX = 512 + D_HEAD = 64 + causal = False + device = "cuda" + torch.manual_seed(42) + block_M = 64 + block_N = 64 if D_HEAD <= 64 else 32 + Q = torch.randn(BATCH, N_CTX, H, D_HEAD, device=device, dtype=torch.half) + K = torch.randn_like(Q) + V = torch.randn_like(Q) + O = torch.randn_like(Q) + dO = torch.randn_like(Q) + lse = torch.zeros(BATCH, H, N_CTX, device=device, dtype=torch.float32) + with torch.no_grad(): + mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD) + kernel = flashattn_bwd(BATCH, H, N_CTX, D_HEAD, causal, block_M, block_N) + dQ = torch.zeros(BATCH, N_CTX, H, D_HEAD, device=device, dtype=torch.float32) + dK = torch.zeros(BATCH, N_CTX, H, D_HEAD, device=device, dtype=torch.float16) + dV = torch.zeros(BATCH, N_CTX, H, D_HEAD, device=device, dtype=torch.float16) + Delta = mod_prep(O, dO) + from tilelang.profiler import do_bench + + def run_kernel_only(): + 
kernel(Q, K, V, dO, lse, Delta, dQ, dK, dV) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head', type=int, default=64, help='Head dimension') - parser.add_argument('--causal', type=bool, default=False, help='Causal flag') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head", type=int, default=64, help="Head dimension") + parser.add_argument("--causal", type=bool, default=False, help="Causal flag") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, args.causal) diff --git a/examples/flash_attention/example_mha_bwd_wgmma_pipelined.py b/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py similarity index 64% rename from examples/flash_attention/example_mha_bwd_wgmma_pipelined.py rename to examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py index 7ad417ef5..c0fe4e33d 100644 --- a/examples/flash_attention/example_mha_bwd_wgmma_pipelined.py +++ b/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py @@ -7,22 +7,24 @@ @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - Output: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + Output: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=128) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -37,27 +39,26 @@ def flash_fwd( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * 
block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -71,29 +72,31 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -102,37 +105,39 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - sm_scale = (1.0 / dim)**0.5 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + sm_scale = (1.0 / dim) ** 0.5 + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - dO: 
T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dK: T.Tensor(shape, dtype), # type: ignore - dV: T.Tensor(shape, dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dK: T.Tensor(shape, dtype), # type: ignore + dV: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=256) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim], dtype) @@ -157,47 +162,34 @@ def flash_bwd( dk_shared = T.alloc_shared([block_M, dim], dtype) dq_shared = T.alloc_shared([block_N, dim], accum_dtype) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - dq_shared: tilelang.layout.make_swizzled_layout(dq_shared), - }) - - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=2): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) - T.gemm( - K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) - T.gemm( - V_shared, - do, - dsT, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow, - wg_wait=-1) + T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) T.wait_wgmma(1) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + # We don't need to handle OOB positions for non-causal cases, + # since OOB values won't affect other positions here. 
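The inner loop here recomputes the attention probabilities from the stored log-sum-exp and then forms dS = P * (dP - Delta), where Delta = rowsum(dO * O) is exactly what flashattn_bwd_preprocess produces. A minimal PyTorch sketch of that identity for a single (batch, head) slice, with illustrative tensor names and the same base-2 convention for lse as the forward kernel:

import torch

def flash_bwd_reference(q, k, v, do, lse, sm_scale):
    # q, k, v, do: [seq, dim]; lse: [seq], holding log2 of the softmax denominator
    # plus the running max, in the same scaled base-2 units the forward stores.
    log2e = 1.44269504
    s = q @ k.T                                          # raw scores (acc_s / qkT)
    p = torch.exp2(s * sm_scale * log2e - lse[:, None])  # softmax probs recovered from lse
    o = p @ v
    delta = (do * o).sum(dim=-1)                         # rowsum(dO * O): what Delta holds
    dv = p.T @ do                                        # dV = P^T dO
    dp = do @ v.T                                        # dP = dO V^T
    ds = p * (dp - delta[:, None]) * sm_scale            # dsT_cast in the kernel
    dq = ds @ k                                          # accumulated into dQ via atomic_add
    dk = ds.T @ q
    return dq, dk, dv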
T.wait_wgmma(0) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -208,17 +200,16 @@ def flash_bwd( T.gemm(dsT_shared, K_shared, dq, transpose_A=True, wg_wait=1) T.wait_wgmma(0) T.copy(dq, dq_shared) - T.atomic_add(dQ[bz, k * block_N:(k + 1) * block_N, bx, :], dq_shared) + T.atomic_add(dQ[bz, k * block_N : (k + 1) * block_N, bx, :], dq_shared) T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, dV[bz, by * block_M:(by + 1) * block_M, bx, :]) - T.copy(dk_shared, dK[bz, by * block_M:(by + 1) * block_M, bx, :]) + T.copy(dv_shared, dV[bz, by * block_M : (by + 1) * block_M, bx, :]) + T.copy(dk_shared, dK[bz, by * block_M : (by + 1) * block_M, bx, :]) return flash_bwd class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal): BATCH, N_CTX, H, D_HEAD = q.shape @@ -260,15 +251,15 @@ def maybe_contiguous(x): def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -283,9 +274,7 @@ def main( total_flops = 5 * flops_per_matmul if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD, dtype=torch.half, device="cuda").normal_().requires_grad_() K = torch.empty_like(Q).normal_().requires_grad_() V = torch.empty_like(Q).normal_().requires_grad_() dO = torch.randn_like(Q) @@ -305,7 +294,7 @@ def main( assert torch.allclose(dV, dV_ref, rtol=1e-2, atol=1e-2) assert torch.allclose(dK, dK_ref, rtol=1e-2, atol=1e-2) assert torch.allclose(dQ, dQ_ref, rtol=1e-2, atol=1e-2) - print('All checks passed.✅') + print("All checks passed.✅") def run(): O_ref.backward(dO, retain_graph=True) @@ -321,12 +310,44 @@ def run1(): print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf(): + BATCH = 1 + H = 32 + N_CTX = 256 + D_HEAD = 64 + causal = False + device = "cuda" + torch.manual_seed(0) + block_M = 128 + block_N = 128 if D_HEAD <= 64 else 32 + Q = torch.randn(BATCH, N_CTX, H, D_HEAD, device=device, dtype=torch.half) + K = torch.randn_like(Q) + V = torch.randn_like(Q) + O = torch.randn_like(Q) + dO = torch.randn_like(Q) + lse = torch.zeros(BATCH, H, N_CTX, device=device, dtype=torch.float32) + with torch.no_grad(): + mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD) + kernel = flashattn_bwd(BATCH, H, N_CTX, D_HEAD, causal, block_M, block_N) + dQ = torch.zeros(BATCH, N_CTX, H, D_HEAD, device=device, dtype=torch.float32) + dK = torch.zeros_like(Q, dtype=torch.float16) + dV = torch.zeros_like(Q, dtype=torch.float16) + Delta = mod_prep(O, dO) + + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(Q, K, V, dO, lse, 
Delta, dQ, dK, dV) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head', type=int, default=64, help='Head dimension') - parser.add_argument('--causal', type=bool, default=False, help='Causal flag') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head", type=int, default=64, help="Head dimension") + parser.add_argument("--causal", type=bool, default=False, help="Causal flag") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, args.causal) diff --git a/examples/flash_attention/example_mha_fwd_bhsd.py b/examples/flash_attention/example_mha_fwd_bhsd.py index f07f7a618..400736541 100644 --- a/examples/flash_attention/example_mha_fwd_bhsd.py +++ b/examples/flash_attention/example_mha_fwd_bhsd.py @@ -15,107 +15,27 @@ def get_configs(): @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_q, - seq_kv, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=1, - threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=64, block_N=64, num_stages=1, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, heads, seq_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" - @T.macro - def MMA0( - K: T.Tensor(kv_shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - q_idx = bx * block_M + i + past_len - k_idx = k * block_N + j - acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(kv_shape, dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: 
T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. - # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -131,43 +51,69 @@ def main( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv( - (bx + 1) * block_M + - past_len, block_N)) if is_causal else T.ceildiv(seq_kv, block_N)) + T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + if is_causal + else T.ceildiv(seq_kv, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + past_len + k_idx = k * block_N + j + acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_kv, -T.infinity(acc_s.dtype), 0) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, 
block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bhqd,bhkd->bhqk', Q, K) + scores = torch.einsum("bhqd,bhkd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_q = Q.size(2) seq_kv = K.size(2) mask = torch.tril(torch.ones(seq_q, seq_kv, device=scores.device), seq_kv - seq_q) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bhkd->bhqd', attention_weights, V) + output = torch.einsum("bhqk,bhkd->bhqd", attention_weights, V) return output @@ -185,18 +131,8 @@ def main( if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_q, - seq_kv, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=1, - threads=128) + if not tune: + kernel = flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=64, block_N=64, num_stages=1, threads=128) ref_program_processed = partial(ref_program, is_causal=is_causal) profiler = kernel.get_profiler() @@ -219,14 +155,28 @@ def main( print(f"Ref latency: {ref_latency}") +def run_regression_perf( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 64, + is_causal: bool = False, + tune: bool = False, +): + kernel = flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256) + profiler = kernel.get_profiler() + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=1, help='heads') - parser.add_argument('--seq_q', type=int, default=256, help='query sequence length') - parser.add_argument('--seq_kv', type=int, default=256, help='key/value sequence length') - parser.add_argument('--dim', type=int, default=64, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=1, help="heads") + parser.add_argument("--seq_q", type=int, default=256, help="query sequence length") + parser.add_argument("--seq_kv", type=int, default=256, help="key/value sequence length") + parser.add_argument("--dim", type=int, default=64, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal", default=False) + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.is_causal, args.tune) diff --git 
a/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py b/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py index 26167b34b..90514f762 100644 --- a/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py +++ b/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py @@ -15,107 +15,27 @@ def get_configs(): @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_q, - seq_kv, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=2, - threads=256): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, heads, seq_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" - @T.macro - def MMA0( - K: T.Tensor(kv_shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - q_idx = bx * block_M + i + past_len - k_idx = k * block_N + j - acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(kv_shape, dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. - # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. 
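The 1.44269504 constant used throughout these kernels is log2(e): exp(x - m) is evaluated as exp2(x * log2(e) - m * log2(e)) so the scale multiply and the max subtraction fold into a single FFMA per element, and the softmax scale 1/sqrt(dim) is folded into the same constant. A small sketch of the equivalence (illustrative values only):

import math
import torch

x = torch.randn(4, 8)
m = x.max(dim=-1, keepdim=True).values
log2e = math.log2(math.e)                 # 1.4426950408889634
ref = torch.exp(x - m)
fast = torch.exp2(x * log2e - m * log2e)  # what T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) computes
assert torch.allclose(ref, fast, atol=1e-6)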
- acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -131,48 +51,75 @@ def main( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv( - (bx + 1) * block_M + - past_len, block_N)) if is_causal else T.ceildiv(seq_kv, block_N)) + T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + if is_causal + else T.ceildiv(seq_kv, block_N) + ) for k in T.Pipelined( - loop_range, - num_stages=num_stages, - order=[-1, 0, 3, 1, -1, 2], - stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + loop_range, + num_stages=num_stages, + order=[-1, 0, 3, 1, -1, 2], + stage=[-1, 0, 0, 1, -1, 1], + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]], + ): + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + past_len + k_idx = k * block_N + j + acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_kv, -T.infinity(acc_s.dtype), 0) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, 
by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bhqd,bhkd->bhqk', Q, K) + scores = torch.einsum("bhqd,bhkd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_q = Q.size(2) seq_kv = K.size(2) mask = torch.tril(torch.ones(seq_q, seq_kv, device=scores.device), seq_kv - seq_q) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bhkd->bhqd', attention_weights, V) + output = torch.einsum("bhqk,bhkd->bhqd", attention_weights, V) return output @@ -190,18 +137,8 @@ def main( if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_q, - seq_kv, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=2, - threads=256) + if not tune: + kernel = flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256) ref_program_processed = partial(ref_program, is_causal=is_causal) profiler = kernel.get_profiler() @@ -224,14 +161,28 @@ def main( print(f"Ref latency: {ref_latency}") +def run_regression_perf( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + is_causal: bool = False, + tune: bool = False, +): + kernel = flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256) + profiler = kernel.get_profiler() + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_q', type=int, default=4096, help='query sequence length') - parser.add_argument('--seq_kv', type=int, default=4096, help='key/value sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_q", type=int, default=4096, help="query sequence length") + parser.add_argument("--seq_kv", type=int, default=4096, help="key/value sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.is_causal, args.tune) diff --git a/examples/flash_attention/example_mha_fwd_bshd.py b/examples/flash_attention/example_mha_fwd_bshd.py index 6a1f707e5..e584971c0 100644 --- a/examples/flash_attention/example_mha_fwd_bshd.py +++ b/examples/flash_attention/example_mha_fwd_bshd.py @@ -15,100 +15,23 @@ def get_configs(): @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_len, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=1, 
- threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_len, dim, is_causal, block_M=64, block_N=64, num_stages=1, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" - - @T.macro - def MMA0( - K: T.Tensor(shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by, :], K_shared) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(shape, dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. - # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. 
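The macros removed here are inlined in the hunks that follow, but the algorithm is unchanged: an online softmax that keeps a running row max and running denominator per query row and rescales the output accumulator whenever the max grows. A compact PyTorch sketch of that update, processing keys block by block (shapes and block size are illustrative):

import torch

def online_softmax_attention(q, k, v, block_n=64):
    # q: [M, d]; k, v: [N, d]. Mirrors acc_s / scores_max / scores_scale / logsum / acc_o.
    m_rows, d = q.shape
    scale = (1.0 / d) ** 0.5 * 1.44269504                        # sm_scale * log2(e)
    acc_o = torch.zeros(m_rows, d)
    logsum = torch.zeros(m_rows)
    row_max = torch.full((m_rows,), float("-inf"))
    for start in range(0, k.shape[0], block_n):
        s = q @ k[start:start + block_n].T                       # acc_s for this key block
        prev_max = row_max
        row_max = torch.maximum(prev_max, s.max(dim=-1).values)
        rescale = torch.exp2(prev_max * scale - row_max * scale) # scores_scale
        p = torch.exp2(s * scale - row_max[:, None] * scale)
        logsum = logsum * rescale + p.sum(dim=-1)
        acc_o = acc_o * rescale[:, None] + p @ v[start:start + block_n]
    return acc_o / logsum[:, None]                               # softmax(q k^T / sqrt(d)) @ v

As in the kernels, the normalization by logsum is deferred until after the loop, so each iteration only needs the cheap per-row rescale.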
- acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - Q: T.Tensor(shape, dtype), - K: T.Tensor(shape, dtype), - V: T.Tensor(shape, dtype), - Output: T.Tensor(shape, dtype), + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + Output: T.Tensor(shape, dtype), ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -124,40 +47,64 @@ def main( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(O_shared, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) return main def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if 
is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -174,17 +121,8 @@ def main( if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_len, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=1, - threads=128) + if not tune: + kernel = flashattn(batch, heads, seq_len, dim, is_causal, block_M=128, block_N=128, num_stages=1, threads=128) ref_program_processed = partial(ref_program, is_causal=is_causal) profiler = kernel.get_profiler() profiler.assert_allclose(ref_program_processed, rtol=0.01, atol=0.01) @@ -206,13 +144,19 @@ def main( print(f"Ref latency: {ref_latency}") +def run_regression_perf(batch: int = 8, heads: int = 32, seq_len: int = 4096, dim: int = 128, is_causal: bool = False): + kernel = flashattn(batch, heads, seq_len, dim, is_causal, block_M=128, block_N=128, num_stages=1, threads=128) + profiler = kernel.get_profiler() + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.tune) diff --git a/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py index 3928db4c3..d6e1490c9 100644 --- a/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py +++ b/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py @@ -15,100 +15,23 @@ def get_configs(): @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_len, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=2, - threads=256): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_len, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" - - @T.macro - def MMA0( - K: T.Tensor(shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], 
accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by, :], K_shared) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(shape, dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. - # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. 
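# A minimal standalone sketch (not part of the kernel above) of the identity the
# comment relies on: with scale = (1.0 / dim) ** 0.5 * log2(e),
# exp((x - m) / sqrt(dim)) == exp2(x * scale - m * scale), so the per-element
# scaling and max subtraction fold into a single multiply-add. The concrete
# values below are illustrative assumptions only.
import math

dim = 128
x, m = 3.0, 5.0  # a raw score and its running row max
scale = (1.0 / dim) ** 0.5 * math.log2(math.e)
reference = math.exp((x - m) / math.sqrt(dim))
fused = 2.0 ** (x * scale - m * scale)
assert abs(reference - fused) < 1e-9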
- acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - Q: T.Tensor(shape, dtype), - K: T.Tensor(shape, dtype), - V: T.Tensor(shape, dtype), - Output: T.Tensor(shape, dtype), + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + Output: T.Tensor(shape, dtype), ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -124,45 +47,70 @@ def main( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined( - loop_range, - num_stages=num_stages, - order=[-1, 0, 3, 1, -1, 2], - stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + loop_range, + num_stages=num_stages, + order=[-1, 0, 3, 1, -1, 2], + stage=[-1, 0, 0, 1, -1, 1], + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]], + ): + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + 
T.copy(O_shared, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) return main def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -179,17 +127,8 @@ def main( if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_len, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=2, - threads=256) + if not tune: + kernel = flashattn(batch, heads, seq_len, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256) ref_program_processed = partial(ref_program, is_causal=is_causal) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) profiler.assert_allclose(ref_program_processed, rtol=0.01, atol=0.01) @@ -211,13 +150,19 @@ def main( print(f"Ref latency: {ref_latency}") +def run_regression_perf(batch: int = 8, heads: int = 32, seq_len: int = 4096, dim: int = 128, is_causal: bool = False): + kernel = flashattn(batch, heads, seq_len, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.tune) diff --git a/examples/flash_attention/example_mha_fwd_varlen.py b/examples/flash_attention/example_mha_fwd_varlen.py index f381e900a..0f3610b11 100644 --- a/examples/flash_attention/example_mha_fwd_varlen.py +++ b/examples/flash_attention/example_mha_fwd_varlen.py @@ -4,109 +4,51 @@ import tilelang.language as T import tilelang.testing import argparse +from tilelang.profiler import do_bench +from tilelang.autotuner import set_autotune_inputs, autotune import torch -from einops import rearrange, repeat from varlen_utils import generate_random_padding_mask, generate_qkv +import itertools -def attention_ref( - q, - k, - v, - query_padding_mask=None, - key_padding_mask=None, - causal=False, - window_size=(-1, -1), # -1 means infinite window size - 
upcast=True, -): - """ - Arguments: - q: (batch_size, seqlen_q, nheads, head_dim) - k: (batch_size, seqlen_k, nheads_k, head_dim) - v: (batch_size, seqlen_k, nheads_k, head_dim) - query_padding_mask: (batch_size, seqlen_q) - key_padding_mask: (batch_size, seqlen_k) - attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k) - dropout_p: float - dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k) - causal: whether to apply causal masking - window_size: (int, int), left and right window size - upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast - output back to fp16/bf16. - reorder_ops: whether to change the order of operations (scaling k instead of scaling q, etc.) - without changing the math. This is to estimate the numerical error from operation - reordering. - Output: - output: (batch_size, seqlen_q, nheads, head_dim) - attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout - """ - if causal: - window_size = (window_size[0], 0) - dtype_og = q.dtype - if upcast: - q, k, v = q.float(), k.float(), v.float() - dim = q.shape[-1] - scale = (1.0 / dim)**0.5 # log2(e) - k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2]) - v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2]) - scores = torch.einsum("bthd,bshd->bhts", q, k) - if key_padding_mask is not None: - scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")) - # scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), 0) - scores = scores * scale - attention = torch.softmax(scores, dim=-1).to(v.dtype) - - # We want to mask here so that the attention matrix doesn't have any NaNs - # Otherwise we'll get NaN in dV - if query_padding_mask is not None: - attention = attention.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) - output = torch.einsum("bhts,bshd->bthd", attention, v) - if query_padding_mask is not None: - output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0) - return output.to(dtype=dtype_og), attention.to(dtype=dtype_og) +def get_configs(): + iter_params = dict(block_M=[64, 128], block_N=[64, 128], num_stages=[0, 1, 2, 3], threads=[128, 256]) + return [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())] +@autotune(configs=get_configs()) @tilelang.jit( - out_idx=[6], pass_configs={ + out_idx=[6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch_size, - UQ, - UKV, - heads, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=0, - threads=32): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch_size, UQ, UKV, heads, dim, is_causal, block_M=64, block_N=64, num_stages=1, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) q_shape = [UQ, heads, dim] k_shape = [UKV, heads, dim] v_shape = [UKV, heads, dim] o_shape = [UQ, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - Q_unpad: T.Tensor(q_shape, dtype), - K_unpad: T.Tensor(k_shape, dtype), - V_unpad: T.Tensor(v_shape, dtype), - cu_seqlens_q: T.Tensor([batch_size + 1], "int32"), - cu_seqlens_k: T.Tensor([batch_size + 1], "int32"), - max_seqlen_q: T.int32, - Output_unpad: T.Tensor(o_shape, dtype), + Q_unpad: T.Tensor(q_shape, dtype), + K_unpad: T.Tensor(k_shape, dtype), + V_unpad: T.Tensor(v_shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + cu_seqlens_k: T.Tensor([batch_size + 1], T.int32), + 
max_seqlen_q: T.int32, + Output_unpad: T.Tensor(o_shape, dtype), ): - with T.Kernel( - T.ceildiv(max_seqlen_q, block_M), heads, batch_size, - threads=threads) as (bx, by, bz): - Q_shared = T.alloc_shared([block_M, dim], dtype, "shared") - K_shared = T.alloc_shared([block_N, dim], dtype, "shared") - V_shared = T.alloc_shared([block_N, dim], dtype, "shared") - O_shared = T.alloc_shared([block_M, dim], dtype, "shared") + with T.Kernel(T.ceildiv(max_seqlen_q, block_M), heads, batch_size, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_M, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + O_shared = T.alloc_shared([block_M, dim], dtype) acc_s = T.alloc_fragment([block_M, block_N], accum_dtype) acc_s_cast = T.alloc_fragment([block_M, block_N], dtype) acc_o = T.alloc_fragment([block_M, dim], accum_dtype) @@ -120,46 +62,46 @@ def main( head_idx = by q_start_idx = cu_seqlens_q[batch_idx] - k_start_idx = cu_seqlens_k[batch_idx] - v_start_idx = cu_seqlens_k[batch_idx] + kv_start_idx = cu_seqlens_k[batch_idx] q_end_idx = cu_seqlens_q[batch_idx + 1] - k_end_idx = cu_seqlens_k[batch_idx + 1] - v_end_idx = cu_seqlens_k[batch_idx + 1] + kv_end_idx = cu_seqlens_k[batch_idx + 1] q_current_seqlen = q_end_idx - q_start_idx - k_current_seqlen = k_end_idx - k_start_idx - v_current_seqlen = v_end_idx - v_start_idx + kv_current_seqlen = kv_end_idx - kv_start_idx - for i, d in T.Parallel(block_M, dim): - if bx * block_M + i < q_current_seqlen: - Q_shared[i, d] = Q_unpad[q_start_idx + bx * block_M + i, head_idx, d] - else: - Q_shared[i, d] = 0 + T.copy( + Q_unpad[q_start_idx + bx * block_M : q_start_idx + bx * block_M + block_M, head_idx, :], Q_shared + ) # OOB positions will be handled below T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = T.ceildiv(k_current_seqlen, block_N) + offset = kv_current_seqlen - q_current_seqlen # always align on the right + loop_range = ( + T.min(T.ceildiv(offset + (bx + 1) * block_M, block_N), T.ceildiv(kv_current_seqlen, block_N)) + if is_causal + else T.ceildiv(kv_current_seqlen, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): # Q * K - for i, d in T.Parallel(block_N, dim): - if k * block_N + i < k_current_seqlen: - K_shared[i, d] = K_unpad[k_start_idx + k * block_N + i, head_idx, d] - else: - K_shared[i, d] = 0 + T.copy( + K_unpad[kv_start_idx + k * block_N : kv_start_idx + k * block_N + block_N, head_idx, :], K_shared + ) # OOB positions will be handled below if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= k * block_N + j) and - (bx * block_M + i >= q_current_seqlen or - k * block_N + j >= k_current_seqlen), - -T.infinity(acc_s.dtype), 0) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i + offset < k * block_N + j) + or (bx * block_M + i >= q_current_seqlen or k * block_N + j >= kv_current_seqlen), + -1e9, + 0, + ) else: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= q_current_seqlen or - k * block_N + j >= k_current_seqlen), - -T.infinity(acc_s.dtype), 0) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= q_current_seqlen or k * block_N + j >= kv_current_seqlen), -1e9, 0 + ) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -167,6 +109,8 @@ def main( T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, 
clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. @@ -189,18 +133,17 @@ def main( acc_o[i, j] *= scores_scale[i] # V * softmax(Q * K) - for i, d in T.grid(block_N, dim): - if k * block_N + i < v_current_seqlen: - V_shared[i, d] = V_unpad[v_start_idx + k * block_N + i, head_idx, d] - else: - V_shared[i, d] = 0 + T.copy( + V_unpad[kv_start_idx + k * block_N : kv_start_idx + k * block_N + block_N, head_idx, :], V_shared + ) # OOB positions' weights are 0 T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_M, dim): - acc_o[i, j] /= logsum[i] - T.copy(acc_o, O_shared) + # When sq > skv, some tokens can see nothing + acc_o[i, j] = 0 if is_causal and bx * block_M + i + offset < 0 else acc_o[i, j] / logsum[i] + T.copy(acc_o, O_shared) for i, d in T.Parallel(block_M, dim): if bx * block_M + i < q_current_seqlen: Output_unpad[q_start_idx + bx * block_M + i, head_idx, d] = O_shared[i, d] @@ -208,19 +151,17 @@ def main( return main -def main(batch: int = 8, heads: int = 64, seq_len: int = 2048, dim: int = 128): +def main(batch: int = 8, heads: int = 64, seq_len: int = 2048, dim: int = 128, causal: bool = False, tune: bool = False): flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim total_flops = 2 * flops_per_matmul tilelang.testing.set_random_seed(0) - causal = False if causal: total_flops *= 0.5 dtype = torch.float16 device = torch.device("cuda") - window_size = (-1, -1) q = torch.randn(batch, seq_len, heads, dim, dtype=dtype, requires_grad=True).to(device) k = torch.randn(batch, seq_len, heads, dim, dtype=dtype, requires_grad=True).to(device) @@ -240,30 +181,23 @@ def main(batch: int = 8, heads: int = 64, seq_len: int = 2048, dim: int = 128): k, v, output_pad_fn, - dq_pad_fn, - dk_pad_fn, - ) = generate_qkv( - q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + _, + _, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) UQ = q_unpad.shape[0] # unpadded query length - UK = k_unpad.shape[0] # unpadded key length UKV = k_unpad.shape[0] # unpadded query key length - kernel = flashattn(batch, UQ, UKV, heads, dim, causal) + if tune: + with set_autotune_inputs(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q): + kernel = flashattn(batch, UQ, UKV, heads, dim, causal) + else: + kernel = flashattn(batch, UQ, UKV, heads, dim, causal, block_M=64, block_N=64, num_stages=1, threads=128) + # NOTE: (128, 128, 2or3, 256) is recommended for Hopper out_unpad = kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q) out = output_pad_fn(out_unpad) - out_ref, _ = attention_ref( - q, - k, - v, - query_padding_mask, - key_padding_mask, - causal=causal, - ) - torch.testing.assert_close(out, out_ref, rtol=1e-2, atol=1e-2) - import flash_attn fla_out_unpad = flash_attn.flash_attn_varlen_func( @@ -282,13 +216,67 @@ def main(batch: int = 8, heads: int = 64, seq_len: int = 2048, dim: int = 128): print("All checks passed.✅") + # benchmark + t = do_bench(lambda: kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q)) + print(f"Tilelang time: {t} ms") + print(f"Tilelang: {total_flops / t * 1e-9} TFlops") + t = do_bench( + lambda: flash_attn.flash_attn_varlen_func( + q_unpad, k_unpad, v_unpad, cu_seqlens_q, 
cu_seqlens_k, max_seqlen_q, max_seqlen_k, 0.0, causal=causal + ) + ) + print(f"FA2 time: {t} ms") + print(f"FA2: {total_flops / t * 1e-9} TFlops") + + +def run_regression_perf(batch: int = 8, heads: int = 64, seq_len: int = 2048, dim: int = 128, causal: bool = False): + flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim + total_flops = 2 * flops_per_matmul + tilelang.testing.set_random_seed(0) + if causal: + total_flops *= 0.5 + dtype = torch.float16 + device = torch.device("cuda") + q = torch.randn(batch, seq_len, heads, dim, dtype=dtype, requires_grad=True).to(device) + k = torch.randn(batch, seq_len, heads, dim, dtype=dtype, requires_grad=True).to(device) + v = torch.randn(batch, seq_len, heads, dim, dtype=dtype, requires_grad=True).to(device) + query_padding_mask = generate_random_padding_mask(seq_len, batch, device, mode="random") + key_padding_mask = generate_random_padding_mask(seq_len, batch, device, mode="random") + ( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + k, + v, + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + UQ = q_unpad.shape[0] + UKV = k_unpad.shape[0] + kernel = flashattn(batch, UQ, UKV, heads, dim, causal, block_M=128, block_N=128, num_stages=2, threads=256) + + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q) + + return do_bench(run_kernel_only, backend="cupti") + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='heads') - parser.add_argument('--seq_len', type=int, default=2048, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="heads") + parser.add_argument("--seq_len", type=int, default=2048, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", default=False, help="causal attention") + parser.add_argument("--tune", action="store_true", default=False, help="tune the kernel") args = parser.parse_args() - main(args.batch, args.heads, args.seq_len, args.dim) + main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.tune) diff --git a/examples/flash_attention/regression_example_flash_attention.py b/examples/flash_attention/regression_example_flash_attention.py new file mode 100644 index 000000000..8710bbb6e --- /dev/null +++ b/examples/flash_attention/regression_example_flash_attention.py @@ -0,0 +1,74 @@ +import tilelang.testing +import example_gqa_fwd_bshd +import example_gqa_fwd_bshd_wgmma_pipelined +import example_mha_fwd_bhsd +import example_mha_fwd_bhsd_wgmma_pipelined +import example_mha_fwd_bshd +import example_mha_fwd_bshd_wgmma_pipelined +import example_mha_fwd_varlen +import example_gqa_bwd_tma_reduce_varlen +import example_gqa_bwd +import example_gqa_bwd_wgmma_pipelined +import example_mha_bwd_bshd +import example_mha_bwd_bhsd +import example_mha_bwd_bshd_wgmma_pipelined + + +def regression_example_gqa_bwd_tma_reduce_varlen(): + tilelang.testing.process_func(example_gqa_bwd_tma_reduce_varlen.run_regression_perf) + + +def regression_example_gqa_bwd(): + 
tilelang.testing.process_func(example_gqa_bwd.run_regression_perf) + + +def regression_example_gqa_bwd_wgmma_pipelined(): + tilelang.testing.process_func(example_gqa_bwd_wgmma_pipelined.run_regression_perf) + + +def regression_example_mha_bwd_bshd(): + tilelang.testing.process_func(example_mha_bwd_bshd.run_regression_perf) + + +def regression_example_mha_bwd_bhsd(): + tilelang.testing.process_func(example_mha_bwd_bhsd.run_regression_perf) + + +def regression_example_mha_bwd_bshd_wgmma_pipelined(): + tilelang.testing.process_func(example_mha_bwd_bshd_wgmma_pipelined.run_regression_perf) + + +def regression_example_gqa_fwd_bshd_wgmma_pipelined(): + tilelang.testing.process_func( + example_gqa_fwd_bshd_wgmma_pipelined.run_regression_perf, batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16 + ) + + +def regression_example_gqa_fwd_bshd(): + tilelang.testing.process_func( + example_gqa_fwd_bshd.run_regression_perf, batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16 + ) + + +def regression_example_mha_fwd_bhsd_wgmma_pipelined(): + tilelang.testing.process_func(example_mha_fwd_bhsd_wgmma_pipelined.run_regression_perf) + + +def regression_example_mha_fwd_bhsd(): + tilelang.testing.process_func(example_mha_fwd_bhsd.run_regression_perf) + + +def regression_example_mha_fwd_bshd_wgmma_pipelined(): + tilelang.testing.process_func(example_mha_fwd_bshd_wgmma_pipelined.run_regression_perf, batch=1, heads=32, seq_len=256) + + +def regression_example_mha_fwd_bshd(): + tilelang.testing.process_func(example_mha_fwd_bshd.run_regression_perf, batch=1, seq_len=256) + + +def regression_example_mha_fwd_varlen(): + tilelang.testing.process_func(example_mha_fwd_varlen.run_regression_perf, batch=4, heads=16, seq_len=512, dim=64) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/flash_attention/test_example_flash_attention.py b/examples/flash_attention/test_example_flash_attention.py index f4932aee9..a74bf071b 100644 --- a/examples/flash_attention/test_example_flash_attention.py +++ b/examples/flash_attention/test_example_flash_attention.py @@ -2,7 +2,7 @@ import example_gqa_bwd import example_gqa_bwd_wgmma_pipelined -import example_mha_bwd +import example_mha_bwd_bshd import example_mha_bwd_bhsd import example_mha_fwd_bhsd_wgmma_pipelined import example_gqa_fwd_bshd @@ -10,9 +10,10 @@ import example_gqa_fwd_bshd_wgmma_pipelined import example_mha_fwd_bshd_wgmma_pipelined import example_mha_fwd_varlen -import example_mha_bwd_wgmma_pipelined +import example_mha_bwd_bshd_wgmma_pipelined import example_mha_fwd_bhsd import example_gqa_bwd_tma_reduce_varlen +import example_gqa_fwd_varlen @tilelang.testing.requires_cuda @@ -33,7 +34,7 @@ def test_example_gqa_bwd_wgmma_pipelined(): @tilelang.testing.requires_cuda def test_example_mha_bwd(): - example_mha_bwd.main( + example_mha_bwd_bshd.main( BATCH=1, H=16, N_CTX=512, @@ -56,20 +57,18 @@ def test_example_mha_bwd_bhsd(): @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_mha_bwd_wgmma_pipelined(): - example_mha_bwd_wgmma_pipelined.main(BATCH=1, H=32, N_CTX=256, D_HEAD=64, causal=False) + example_mha_bwd_bshd_wgmma_pipelined.main(BATCH=1, H=32, N_CTX=256, D_HEAD=64, causal=False) @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_gqa_fwd_bshd_wgmma_pipelined(): - example_gqa_fwd_bshd_wgmma_pipelined.main( - batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16, tune=False) + 
example_gqa_fwd_bshd_wgmma_pipelined.main(batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16, tune=False) @tilelang.testing.requires_cuda def test_example_gqa_fwd_bshd(): - example_gqa_fwd_bshd.main( - batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16, tune=False) + example_gqa_fwd_bshd.main(batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16, tune=False) @tilelang.testing.requires_cuda @@ -96,7 +95,14 @@ def test_example_mha_fwd_bshd(): @tilelang.testing.requires_cuda def test_example_mha_fwd_varlen(): - example_mha_fwd_varlen.main(batch=4, heads=16, seq_len=512, dim=64) + example_mha_fwd_varlen.main(batch=4, heads=16, seq_len=512, dim=64, causal=False) + example_mha_fwd_varlen.main(batch=4, heads=16, seq_len=512, dim=64, causal=True) + + +@tilelang.testing.requires_cuda +def test_example_gqa_fwd_varlen(): + example_gqa_fwd_varlen.main(batch=4, heads=16, q_seqlen=512, k_seqlen=512, dim=64, is_causal=False) + example_gqa_fwd_varlen.main(batch=4, heads=16, q_seqlen=512, k_seqlen=512, dim=64, is_causal=True) if __name__ == "__main__": diff --git a/examples/flash_attention/varlen_utils.py b/examples/flash_attention/varlen_utils.py index 4301215d5..43e21cc3b 100644 --- a/examples/flash_attention/varlen_utils.py +++ b/examples/flash_attention/varlen_utils.py @@ -9,22 +9,14 @@ def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): if mode == "full": lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32) elif mode == "random": - lengths = torch.randint( - max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device) + lengths = torch.randint(max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device) elif mode == "third": lengths = torch.randint(max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device) - padding_mask = ( - repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths) + padding_mask = repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths return padding_mask -def generate_qkv(q, - k, - v, - query_padding_mask=None, - key_padding_mask=None, - kvpacked=False, - qkvpacked=False): +def generate_qkv(q, k, v, query_padding_mask=None, key_padding_mask=None, kvpacked=False, qkvpacked=False): """ Arguments: q: (batch_size, seqlen_q, nheads, d) @@ -39,15 +31,12 @@ def generate_qkv(q, if query_padding_mask is not None: q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, query_padding_mask) - output_pad_fn = lambda output_unpad: pad_input(output_unpad, indices_q, batch_size, seqlen_q - ) + output_pad_fn = lambda output_unpad: pad_input(output_unpad, indices_q, batch_size, seqlen_q) else: q_unpad = rearrange(q, "b s h d -> (b s) h d") - cu_seqlens_q = torch.arange( - 0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device) + cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device) max_seqlen_q = seqlen_q - output_pad_fn = lambda output_unpad: rearrange( - output_unpad, "(b s) h d -> b s h d", b=batch_size) + output_pad_fn = lambda output_unpad: rearrange(output_unpad, "(b s) h d -> b s h d", b=batch_size) if key_padding_mask is not None: k_unpad, indices_k, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask) @@ -55,8 +44,7 @@ def generate_qkv(q, else: k_unpad = rearrange(k, "b s h d -> (b s) h d") v_unpad = rearrange(v, "b s h d -> (b s) h d") - cu_seqlens_k = torch.arange( - 0, (batch_size + 
1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device) + cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device) max_seqlen_k = seqlen_k if qkvpacked: @@ -67,8 +55,7 @@ def generate_qkv(q, if query_padding_mask is not None: dqkv_pad_fn = lambda dqkv_unpad: pad_input(dqkv_unpad, indices_q, batch_size, seqlen_q) else: - dqkv_pad_fn = lambda dqkv_unpad: rearrange( - dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size) + dqkv_pad_fn = lambda dqkv_unpad: rearrange(dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size) return ( qkv_unpad.detach().requires_grad_(), cu_seqlens_q, @@ -84,8 +71,7 @@ def generate_qkv(q, if key_padding_mask is not None: dkv_pad_fn = lambda dkv_unpad: pad_input(dkv_unpad, indices_k, batch_size, seqlen_k) else: - dkv_pad_fn = lambda dkv_unpad: rearrange( - dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size) + dkv_pad_fn = lambda dkv_unpad: rearrange(dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size) return ( q_unpad.detach().requires_grad_(), kv_unpad.detach().requires_grad_(), diff --git a/examples/flash_decoding/example_gqa_decode.py b/examples/flash_decoding/example_gqa_decode.py index 9ec3a0265..9e6f36017 100644 --- a/examples/flash_decoding/example_gqa_decode.py +++ b/examples/flash_decoding/example_gqa_decode.py @@ -15,18 +15,12 @@ def get_configs(): block_N = [64, 128] block_H = [64] - num_split = [2, 4, 8] + num_split = [1, 2, 4, 8] num_stages = [1, 2, 3] threads = [128] _configs = list(itertools.product(block_N, block_H, num_split, num_stages, threads)) - configs = [{ - 'block_N': c[0], - 'block_H': c[1], - 'num_split': c[2], - 'num_stages': c[3], - 'threads': c[4] - } for c in _configs] + configs = [{"block_N": c[0], "block_H": c[1], "num_split": c[2], "num_stages": c[3], "threads": c[4]} for c in _configs] return configs @@ -42,43 +36,42 @@ def get_heuristic_config() -> Tuple[Dict, int]: if sm_version == 89: cfg = dict(block_N=128, block_H=64, num_split=1, num_stages=0, threads=128) else: - cfg = dict(block_N=128, block_H=64, num_split=1, num_stages=2, threads=128) + cfg = dict(block_N=128, block_H=64, num_split=8, num_stages=2, threads=128) return cfg, sm_version # TODO(lei): fix warp specialized and tma lower pass def get_pass_configs(): - return { - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - } + return {tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True} @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit(out_idx=[6], pass_configs=get_pass_configs()) -def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split, num_stages, - threads): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) +def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split, num_stages, threads): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape_q = [batch, heads, dim] shape_k = [batch, seqlen_kv, groups, dim] shape_v = [batch, seqlen_kv, groups, dim] shape_o = [batch, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // groups part_shape = [batch, heads, num_split, dim] valid_block_H = min(block_H, kv_group_num) valid_block_N = min(block_N, seqlen_kv // num_split) - @T.macro - def flash_attn( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), - Output: 
T.Tensor([batch, heads, dim], dtype), + @T.prim_func + def flashattn_gqa_decode_split( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor(part_shape, dtype), + Output: T.Tensor(shape_o, dtype), ): + # split with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) @@ -96,25 +89,43 @@ def flash_attn( bid = bx hid = by + sid = bz cur_kv_head = hid // (kv_group_num // valid_block_H) - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = T.ceildiv((seqlen_kv // num_split), block_N) + for k in T.Pipelined(loop_range, num_stages=num_stages): - T.copy(K[bid, k * block_N:(k + 1) * block_N, cur_kv_head, :], K_shared) - T.copy(mask[bid, k * block_N:(k + 1) * block_N, cur_kv_head], mask_local) + T.copy( + K[ + bid, + (seqlen_kv // num_split) * sid + k * valid_block_N : (seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, + cur_kv_head, + :, + ], + K_shared, + ) + T.copy( + mask[ + bid, + (seqlen_kv // num_split) * sid + k * valid_block_N : (seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, + cur_kv_head, + ], + mask_local, + ) T.clear(acc_s) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else(mask_local[j] != 0, acc_s[i, j], - -T.infinity(accum_dtype)) + acc_s[i, j] = T.if_then_else((mask_local[j] != 0) & (j < seqlen_kv // num_split), acc_s[i, j], -T.infinity(accum_dtype)) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -125,23 +136,66 @@ def flash_attn( T.copy(acc_s, acc_s_cast) for i, j in T.Parallel(block_H, dim): acc_o[i, j] *= scores_scale[i] - T.copy(V[bid, k * block_N:(k + 1) * block_N, cur_kv_head, :], V_shared) + T.copy( + V[ + bid, + (seqlen_kv // num_split) * sid + k * valid_block_N : (seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, + cur_kv_head, + :, + ], + V_shared, + ) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] for i in T.Parallel(block_H): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + + for i in T.Parallel(block_H): + if i < valid_block_H: + glse[bid, hid * valid_block_H + i, sid] = logsum[i] T.copy(acc_o[:valid_block_H, :], O_shared) - T.copy(O_shared, Output[bid, hid * valid_block_H:(hid + 1) * valid_block_H, :]) - - @T.macro - def flash_attn_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor(part_shape, dtype), + T.copy(O_shared, Output_partial[bid, hid * valid_block_H : (hid + 1) * valid_block_H, sid, :]) + + # combine + with T.Kernel(heads, batch, 
threads=128) as (by, bz): + po_local = T.alloc_fragment([dim], dtype) + o_accum_local = T.alloc_fragment([dim], accum_dtype) + lse_local = T.alloc_fragment([num_split, 128], dtype) + lse_logsum_local = T.alloc_fragment([128], accum_dtype) + lse_max_local = T.alloc_fragment([128], accum_dtype) + scale_local = T.alloc_fragment([128], accum_dtype) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + for k, j in T.Parallel(num_split, 128): + lse_local[k, j] = glse[bz, by, k] + T.reduce_max(lse_local, lse_max_local, dim=0, clear=True) + for k in T.serial(num_split): + for j in T.Parallel(128): + lse_logsum_local[j] += T.exp2(lse_local[k, j] - lse_max_local[j]) + for j in T.Parallel(128): + lse_logsum_local[j] = T.log2(lse_logsum_local[j]) + lse_max_local[j] + for k in T.serial(num_split): + for i in T.Parallel(dim): + po_local[i] = Output_partial[bz, by, k, i] + for j in T.Parallel(128): + scale_local[j] = T.exp2(lse_local[k, j] - lse_logsum_local[j]) + # Note: Pay attention to dim and the number of threads in Parallel + for i in T.Parallel(dim): + o_accum_local[i] += po_local[i] * scale_local[i] + for i in T.Parallel(dim): + Output[bz, by, i] = o_accum_local[i] + + @T.prim_func + def flashattn_gqa_decode_no_split( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor(part_shape, dtype), + Output: T.Tensor(shape_o, dtype), ): with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) @@ -160,34 +214,26 @@ def flash_attn_split( bid = bx hid = by - sid = bz cur_kv_head = hid // (kv_group_num // valid_block_H) - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = T.ceildiv((seqlen_kv // num_split), block_N) - for k in T.Pipelined(loop_range, num_stages=num_stages): - T.copy( - K[bid, (seqlen_kv // num_split) * sid + - k * valid_block_N:(seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, - cur_kv_head, :], K_shared) - T.copy( - mask[bid, (seqlen_kv // num_split) * sid + - k * valid_block_N:(seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, - cur_kv_head], mask_local) + T.copy(K[bid, k * block_N : (k + 1) * block_N, cur_kv_head, :], K_shared) + T.copy(mask[bid, k * block_N : (k + 1) * block_N, cur_kv_head], mask_local) T.clear(acc_s) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, block_N): - acc_s[i, - j] = T.if_then_else((mask_local[j] != 0) & (j < seqlen_kv // num_split), - acc_s[i, j], -T.infinity(accum_dtype)) + acc_s[i, j] = T.if_then_else(mask_local[j] != 0, acc_s[i, j], -T.infinity(accum_dtype)) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -198,88 +244,14 @@ def flash_attn_split( T.copy(acc_s, acc_s_cast) for i, j in T.Parallel(block_H, dim): acc_o[i, j] *= scores_scale[i] - T.copy( - V[bid, (seqlen_kv // num_split) * sid + - k * 
valid_block_N:(seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, - cur_kv_head, :], V_shared) + T.copy(V[bid, k * block_N : (k + 1) * block_N, cur_kv_head, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] for i in T.Parallel(block_H): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - - for i in T.Parallel(block_H): - if i < valid_block_H: - glse[bid, hid * valid_block_H + i, sid] = logsum[i] T.copy(acc_o[:valid_block_H, :], O_shared) - T.copy(O_shared, Output_partial[bid, hid * valid_block_H:(hid + 1) * valid_block_H, - sid, :]) - - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor(part_shape, dtype), - Output: T.Tensor(shape_o, dtype), - ): - with T.Kernel(heads, batch, threads=128) as (by, bz): - po_local = T.alloc_fragment([dim], dtype) - o_accum_local = T.alloc_fragment([dim], accum_dtype) - lse_local = T.alloc_fragment([num_split, 128], dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_fragment([128], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - - T.annotate_layout({ - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - lse_max_local: T.Fragment(lse_max_local.shape, forward_thread_fn=lambda i: i), - # lse_local: (local_id, thread_id) - lse_local: T.Fragment(lse_local.shape, forward_fn=lambda i, j: (j, i)), - }) - - T.clear(lse_logsum_local) - T.clear(o_accum_local) - for k, j in T.Parallel(num_split, 128): - lse_local[k, j] = glse[bz, by, k] - T.reduce_max(lse_local, lse_max_local, dim=0, clear=True) - for k in T.Pipelined(num_split, num_stages=1): - lse_local_split[0] = glse[bz, by, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] - for k in T.serial(num_split): - for i in T.Parallel(dim): - po_local[i] = Output_partial[bz, by, k, i] - lse_local_split[0] = glse[bz, by, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) - for i in T.Parallel(dim): - o_accum_local[i] += po_local[i] * scale_local[0] - for i in T.Parallel(dim): - Output[bz, by, i] = o_accum_local[i] - - @T.prim_func - def flashattn_gqa_decode_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor(part_shape, dtype), - Output: T.Tensor(shape_o, dtype), - ): - flash_attn_split(Q, K, V, mask, glse, Output_partial) - combine(glse, Output_partial, Output) - - @T.prim_func - def flashattn_gqa_decode_no_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor(part_shape, dtype), - Output: T.Tensor(shape_o, dtype), - ): - flash_attn(Q, K, V, mask, Output) + T.copy(O_shared, Output[bid, hid * valid_block_H : (hid + 1) * valid_block_H, :]) if num_split > 1: return flashattn_gqa_decode_split @@ -300,27 +272,21 @@ def ref_program(query, key, value, mask, glse, Output_partial): dim = query.shape[-1] num_head_groups = query.shape[1] // key.shape[2] scale = dim**0.5 - key = rearrange(key, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] - value = rearrange(value, 
'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + key = rearrange(key, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] + value = rearrange(value, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] if mask is not None: - mask = rearrange(mask, 'b s h -> b h s') + mask = rearrange(mask, "b s h -> b h s") mask = mask.unsqueeze(1) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, value, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, value, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out @@ -334,16 +300,12 @@ def flash_split_ref(Q, K, V, mask): seqlen_kv = K.size(1) num_head_groups = nheads // groups - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) acc_s = torch.empty((batch, num_head_groups, groups, block_N), device="cuda", dtype=torch.float) - acc_s_cast = torch.empty((batch, num_head_groups, groups, block_N), - device="cuda", - dtype=torch.float16) + acc_s_cast = torch.empty((batch, num_head_groups, groups, block_N), device="cuda", dtype=torch.float16) acc_o = torch.empty((batch, num_head_groups, groups, dim), device="cuda", dtype=torch.float) scores_max = torch.empty((batch, num_head_groups, groups), device="cuda", dtype=torch.float) - scores_max_prev = torch.empty((batch, num_head_groups, groups), - device="cuda", - dtype=torch.float) + scores_max_prev = torch.empty((batch, num_head_groups, groups), device="cuda", dtype=torch.float) scores_scale = torch.empty((batch, num_head_groups, groups), device="cuda", dtype=torch.float) scores_sum = torch.empty((batch, num_head_groups, groups), device="cuda", dtype=torch.float) logsum = torch.empty((batch, num_head_groups, groups), device="cuda", dtype=torch.float) @@ -351,25 +313,25 @@ def flash_split_ref(Q, K, V, mask): glogsum = torch.empty((num_split, batch, nheads), device="cuda", dtype=torch.float) Q_ = Q * scale - Q_ = rearrange(Q_, 'b (h g) d -> b g h d', g=num_head_groups) + Q_ = rearrange(Q_, "b (h g) d -> b g h d", g=num_head_groups) for ks in range(num_split): acc_o.fill_(0) logsum.fill_(0) - scores_max.fill_(float('-inf')) - scores_max_prev.fill_(float('-inf')) + scores_max.fill_(float("-inf")) + scores_max_prev.fill_(float("-inf")) for i in range(int((seqlen_kv // num_split) / block_N)): acc_s.fill_(0) - acc_s = torch.einsum('bghd,bkhd->bghk', Q_, - K[:, (seqlen_kv // num_split) * ks + - i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) # [batch, nheads, block_N] + acc_s = torch.einsum( + "bghd,bkhd->bghk", + Q_, + K[:, (seqlen_kv // num_split) * ks 
+ i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) # [batch, nheads, block_N] if mask is not None: - mask_local = mask[:, (seqlen_kv // num_split) * ks + - i * block_N:(seqlen_kv // num_split) * ks + (i + 1) * block_N, :] - mask_local = rearrange(mask_local, 'b s h -> b h s') + mask_local = mask[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :] + mask_local = rearrange(mask_local, "b s h -> b h s") mask_local = mask_local.unsqueeze(1) - acc_s = acc_s.masked_fill(mask_local == 0, float('-inf')) + acc_s = acc_s.masked_fill(mask_local == 0, float("-inf")) scores_max_prev = scores_max scores_max = acc_s.max(dim=-1, keepdim=False).values # [batch, nheads] scores_scale = torch.exp2(scores_max_prev - scores_max) # [batch, nheads] @@ -377,15 +339,16 @@ def flash_split_ref(Q, K, V, mask): acc_s = torch.exp2(acc_s - scores_max[:, :, :, None]) acc_s_cast = acc_s.to(torch.float16) # [batch, nheads, block_N] acc_o += torch.einsum( - 'bghk,bkhd->bghd', acc_s_cast, - V[:, (seqlen_kv // num_split) * ks + i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) + "bghk,bkhd->bghd", + acc_s_cast, + V[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) scores_sum = acc_s.sum(dim=-1, keepdim=False) logsum = logsum * scores_scale + scores_sum - acc_o_out = rearrange(acc_o, 'b g h d->b (h g) d') - logsum_out = rearrange(logsum, 'b g h->b (h g)') + acc_o_out = rearrange(acc_o, "b g h d->b (h g) d") + logsum_out = rearrange(logsum, "b g h->b (h g)") acc_o_out /= logsum_out[:, :, None] - logsum_out = torch.log2(logsum_out) + rearrange(scores_max, 'b g h->b (h g)') + logsum_out = torch.log2(logsum_out) + rearrange(scores_max, "b g h->b (h g)") gacc_o[ks, :, :, :] = acc_o_out glogsum[ks, :, :] = logsum_out @@ -421,7 +384,7 @@ def calc_sim(x, y, name="tensor"): x, y = x.data.double(), y.data.double() denominator = (x * x + y * y).sum() if denominator == 0: - print_red_warning(f'{name} all zero') + print_red_warning(f"{name} all zero") return 1 sim = 2 * (x * y).sum() / denominator return sim @@ -429,28 +392,23 @@ def calc_sim(x, y, name="tensor"): def assert_similar(x, y, eps=1e-2, name="tensor", assert_=False, print_=True): sim = calc_sim(x, y, name) - diff = 1. 
- sim + diff = 1.0 - sim if not (0 <= diff <= eps): - print_red_warning(f'{name} Error: {diff}') + print_red_warning(f"{name} Error: {diff}") if assert_: - raise AssertionError(f'{name} Error: {diff}') + raise AssertionError(f"{name} Error: {diff}") else: if print_: - print(f'passed: {name} diff={diff}') + print(f"passed: {name} diff={diff}") -def main(batch: int = 1, - heads: int = 32, - groups: int = 8, - kv_seqlen: int = 8192, - dim: int = 128, - tune: bool = False): +def main(batch: int = 1, heads: int = 32, groups: int = 8, kv_seqlen: int = 8192, dim: int = 128, tune: bool = False): batch, heads, groups, kv_seqlen, dim = batch, heads, groups, kv_seqlen, dim qk_flops = 2 * batch * heads * kv_seqlen * dim pv_flops = 2 * batch * heads * kv_seqlen * dim total_flops = qk_flops + pv_flops - if (not tune): + if not tune: config, sm_version = get_heuristic_config() kernel = flashattn(batch, heads, groups, kv_seqlen, dim, **config) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) @@ -470,7 +428,7 @@ def main(batch: int = 1, print(o_ref) assert_similar(o, o_ref, name="o_ref") - assert_similar(o_ref_split, o_ref, name="o_ref_split") + assert_similar(o, o_ref_split, name="o_ref_split") print("All checks pass.") latency = profiler.do_bench(ref_program, warmup=500) @@ -490,13 +448,21 @@ def main(batch: int = 1, print(f"Ref latency: {ref_latency}") +def run_regression_perf(batch: int = 1, heads: int = 32, groups: int = 8, kv_seqlen: int = 8192, dim: int = 128): + batch, heads, groups, kv_seqlen, dim = batch, heads, groups, kv_seqlen, dim + config, _ = get_heuristic_config() + kernel = flashattn(batch, heads, groups, kv_seqlen, dim, **config) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--groups', type=int, default=8, help='groups') - parser.add_argument('--kv_seqlen', type=int, default=8192, help='kv sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--groups", type=int, default=8, help="groups") + parser.add_argument("--kv_seqlen", type=int, default=8192, help="kv sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() main(args.batch, args.heads, args.groups, args.kv_seqlen, args.dim, args.tune) diff --git a/examples/flash_decoding/example_gqa_decode_varlen_logits.py b/examples/flash_decoding/example_gqa_decode_varlen_logits.py index 16924ebe8..864ff3e54 100644 --- a/examples/flash_decoding/example_gqa_decode_varlen_logits.py +++ b/examples/flash_decoding/example_gqa_decode_varlen_logits.py @@ -1,14 +1,11 @@ import torch -import triton -import triton.language as tl import math import argparse import tilelang import tilelang.language as T -from tilelang.autotuner import autotune +from tilelang.profiler import do_bench torch.manual_seed(0) -tilelang.disable_cache() def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: @@ -19,184 +16,13 @@ def 
repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: batch, num_key_value_heads, slen, head_dim = hidden_states.shape if n_rep == 1: return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, - head_dim) + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) -@triton.jit -def _fwd_inner( - q, - k_ptrs, - v_ptrs, - s_ptrs, - m_i, - l_i, - acc, - offs_h, - mask_h, - offs_n, - seqlen, - softmax_scale, - lo, - hi, - stride_kt, - stride_vt, - stride_sh, - stride_sn, - BLOCK_N: tl.constexpr, -): - """Inner loop computation for attention""" - - for blk_idx in tl.range(lo, hi): - start_n = blk_idx * BLOCK_N - k = tl.load(k_ptrs + start_n * stride_kt, mask=offs_n[None, :] + start_n < seqlen) - v = tl.load(v_ptrs + start_n * stride_vt, mask=offs_n[:, None] + start_n < seqlen) - - qk = tl.dot(q, k) - qk *= softmax_scale - qk += tl.where(offs_n[None, :] + start_n < seqlen, 0, -1.0e9) - - row_max = tl.max(qk, 1) - tl.store(s_ptrs + offs_h * stride_sh + blk_idx * stride_sn, row_max, mask=mask_h) - - m_ij = tl.maximum(m_i, row_max) - qk -= m_ij[:, None] - p = tl.math.exp(qk) - l_ij = tl.sum(p, 1) - alpha = tl.math.exp(m_i - m_ij) - l_i = l_i * alpha + l_ij - m_i = m_ij - acc *= alpha[:, None] - p = p.to(v.type.element_ty) - acc += tl.dot(p, v) - - return m_i, l_i, acc - - - -@triton.autotune( - configs=[ - triton.Config({}, num_warps=num_warps, num_stages=num_stages) - for num_warps in [4, 8]\ - for num_stages in [2, 4]\ - ], - key=['gqa_group_size', 'BLOCK_N', 'BLOCK_D', 'BLOCK_H'], -) -@triton.jit -def _fwd_kernel_varlen( - Q, # [token_q = b, h_q, dim] - K, # [token_k, h_kv, dim] - V, - O, - S, - s_aux, - softmax_scale, - cu_seqlens_k, - stride_qt, - stride_qh, - stride_qd, - stride_kt, - stride_kh, - stride_kd, - stride_vt, - stride_vh, - stride_vd, - stride_ot, - stride_oh, - stride_od, - stride_sb, - stride_sh, - stride_sn, #bmask shape [b, q_h, seq/BLOCK_N] - gqa_group_size: tl.constexpr, - BLOCK_H: tl.constexpr, - BLOCK_N: tl.constexpr, - BLOCK_D: tl.constexpr, -): - - off_z = tl.program_id(0) - off_h_for_kv = tl.program_id(1) - off_h_q = off_h_for_kv * gqa_group_size - - cu_k_start = tl.load(cu_seqlens_k + off_z) - cu_k_end = tl.load(cu_seqlens_k + off_z + 1) - - seqlen_k = cu_k_end - cu_k_start - - offs_h = tl.arange(0, BLOCK_H) - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_D) - - Q_ptrs = Q + off_z * stride_qt + off_h_q * stride_qh - K_ptrs = K + (cu_k_start) * stride_kt + off_h_for_kv * stride_kh - V_ptrs = V + (cu_k_start) * stride_vt + off_h_for_kv * stride_vh - O_ptrs = O + off_z * stride_ot + off_h_q * stride_oh - S_ptrs = S + off_z * stride_sb + off_h_q * stride_sh - - mask_h = offs_h < gqa_group_size - q = tl.load( - Q_ptrs + offs_d[None, :] * stride_qd + offs_h[:, None] * stride_qh, mask=mask_h[:, None]) - - if s_aux is not None: - sink = tl.load(s_aux + off_h_q + offs_h, mask=mask_h).to(tl.float32) - l_i = tl.zeros([BLOCK_H], dtype=tl.float32) - m_i = tl.zeros([BLOCK_H], dtype=tl.float32) + sink - else: - l_i = tl.full([BLOCK_H], 1.0, dtype=tl.float32) - m_i = tl.full([BLOCK_H], float("-inf"), dtype=tl.float32) - - acc = tl.zeros([BLOCK_H, BLOCK_D], dtype=tl.float32) - - k_ptrs = K_ptrs + offs_n[None, :] * stride_kt + offs_d[:, None] * stride_kd - v_ptrs = V_ptrs + offs_n[:, None] * stride_vt + offs_d[None, :] * stride_vd - - lo, hi = 0, tl.cdiv(seqlen_k, BLOCK_N) 
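# A minimal pure-Python sketch of the streaming-softmax update that the inner
# loop above performs per K/V block (m = running row max, l = running
# denominator, acc = running weighted sum). Scalar "values" stand in for the
# d-dimensional V rows; all names and numbers here are illustrative assumptions.
import math

def online_softmax_step(m, l, acc, scores, values):
    m_new = max(m, max(scores))
    alpha = math.exp(m - m_new)                  # rescales the previously accumulated state
    p = [math.exp(s - m_new) for s in scores]    # unnormalized block probabilities
    l_new = l * alpha + sum(p)
    acc_new = acc * alpha + sum(pi * vi for pi, vi in zip(p, values))
    return m_new, l_new, acc_new

m, l, acc = float("-inf"), 0.0, 0.0
for block_scores, block_values in [([0.1, 0.7], [1.0, 2.0]), ([1.5, -0.3], [3.0, 4.0])]:
    m, l, acc = online_softmax_step(m, l, acc, block_scores, block_values)
out = acc / l  # equals softmax([0.1, 0.7, 1.5, -0.3]) applied to [1.0, 2.0, 3.0, 4.0]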
- m_i, l_i, acc = _fwd_inner( - q, - k_ptrs, - v_ptrs, - S_ptrs, - m_i, - l_i, - acc, - offs_h, - mask_h, - offs_n, - seqlen_k, - softmax_scale, - lo, - hi, - stride_kt, - stride_vt, - stride_sh, - stride_sn, - BLOCK_N, - ) - - if s_aux is not None: - sink = tl.math.exp(sink - m_i) - l_i = l_i + sink - acc = acc / l_i[:, None] - - else: - l_recip = 1 / l_i[:, None] - acc = acc * l_recip - - for blk_idx in tl.range(lo, hi): - s = tl.load(S_ptrs + offs_h * stride_sh + blk_idx * stride_sn, mask=mask_h) - s = tl.exp(s - m_i) / l_i - tl.store(S_ptrs + offs_h * stride_sh + blk_idx * stride_sn, s, mask=mask_h) - - acc = acc.to(O.dtype.element_ty) - - tl.store( - O_ptrs + offs_h[:, None] * stride_oh + offs_d[None, :] * stride_od, - acc, - mask=mask_h[:, None]) - - def get_configs(): import itertools + block_N = [64, 128] block_H = [64] num_split = [1] @@ -204,54 +30,37 @@ def get_configs(): threads = [128] _configs = list(itertools.product(block_N, block_H, num_split, num_stages, threads)) - configs = [{ - 'block_N': c[0], - 'block_H': c[1], - 'num_split': c[2], - 'num_stages': c[3], - 'threads': c[4] - } for c in _configs] + configs = [{"block_N": c[0], "block_H": c[1], "num_split": c[2], "num_stages": c[3], "threads": c[4]} for c in _configs] return configs -@autotune(configs=get_configs(), warmup=10, rep=10) -@tilelang.jit(out_idx=[-2, -1], debug_root_path="./examples/flash_decoding") -def flashattn(batch, - heads, - k_heads, - max_seqlen_kv, - total_seqlen_k, - dim, - has_sink, - block_N=128, - block_H=64, - num_split=1, - num_stages=1, - threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) +@tilelang.jit(out_idx=[-2, -1]) +def flashattn( + batch, heads, k_heads, max_seqlen_kv, total_seqlen_k, dim, has_sink, block_N=128, block_H=64, num_split=1, num_stages=1, threads=128 +): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape_q = [batch, heads, dim] shape_k = [total_seqlen_k, k_heads, dim] shape_v = [total_seqlen_k, k_heads, dim] shape_o = [batch, heads, dim] shape_s = [batch, heads, math.ceil(max_seqlen_kv / block_N)] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // k_heads valid_block_H = min(block_H, kv_group_num) - # TODO: check if max_seqlen_kv is correct for varlen case - - @T.macro - def flash_attn( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - cu_seqlens_k: T.Tensor([batch + 1], "int32"), - s_aux: T.Tensor([heads], "float32"), - Output: T.Tensor([batch, heads, dim], dtype), - S: T.Tensor(shape_s, dtype), + + @T.prim_func + def flashattn_gqa_decode_no_split( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + cu_seqlens_k: T.Tensor([batch + 1], T.int32), + s_aux: T.Tensor([heads], T.float32), + Output: T.Tensor(shape_o, dtype), + S: T.Tensor(shape_s, dtype), ): - with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bid, hid, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -264,578 +73,148 @@ def flash_attn( scores_scale = T.alloc_fragment([block_H], accum_dtype) scores_sum = T.alloc_fragment([block_H], accum_dtype) logsum = T.alloc_fragment([block_H], accum_dtype) - S_shared = T.alloc_shared([block_H, math.ceil(max_seqlen_kv / block_N)], dtype) - # S_fragment = T.alloc_fragment([block_H, 
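Both the Triton and tilelang paths fold log2(e) into the softmax scale so that exp(x * softmax_scale) can be evaluated as exp2(x * scale), which lowers to the cheaper exp2/ffma path on GPU. A quick numerical check of the identity (the head dimension is illustrative):

import torch

dim = 128
softmax_scale = (1.0 / dim) ** 0.5
scale = softmax_scale * 1.44269504  # log2(e)

x = torch.randn(1024, dtype=torch.float64)
assert torch.allclose(torch.exp(x * softmax_scale), torch.exp2(x * scale), atol=1e-8)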
math.ceil(max_seqlen_kv / block_N)], accum_dtype) - s_aux_shared = T.alloc_shared([block_H], "float32") - - T.annotate_layout({ - # Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - # K_shared: tilelang.layout.make_swizzled_layout(K_shared), - # V_shared: tilelang.layout.make_swizzled_layout(V_shared), - # O_shared: tilelang.layout.make_swizzled_layout(O_shared), - # S_shared: tilelang.layout.make_swizzled_layout(S_shared), - }) - - bid = bx - hid = by + S_shared = T.alloc_shared([block_H, math.ceil(max_seqlen_kv / block_N)], accum_dtype) + S_shared_cast = T.alloc_shared([block_H, math.ceil(max_seqlen_kv / block_N)], dtype) + s_aux_shared = T.alloc_shared([block_H], T.float32) + cur_kv_head = hid // (kv_group_num // valid_block_H) cur_start_k = cu_seqlens_k[bid] cur_end_k = cu_seqlens_k[bid + 1] cur_seqlen_k = cur_end_k - cur_start_k - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - # loop_range = T.ceildiv((seqlen_kv // num_split), block_N) loop_range = T.ceildiv((cur_seqlen_k // num_split), block_N) for k in T.Pipelined(loop_range, num_stages=num_stages): - T.copy(K[cur_start_k + k * block_N:cur_start_k + (k + 1) * block_N, cur_kv_head, :], - K_shared) + T.copy(K[cur_start_k + k * block_N : cur_start_k + (k + 1) * block_N, cur_kv_head, :], K_shared) T.clear(acc_s) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, block_N): - # acc_s[i, j] = T.if_then_else(mask_local[j] != 0 and k * block_N + j < cur_seqlen_k, acc_s[i, j], - # -T.infinity(accum_dtype)) - acc_s[i, j] = T.if_then_else(k * block_N + j < cur_seqlen_k, acc_s[i, j], - -T.infinity(accum_dtype)) + acc_s[i, j] = T.if_then_else(k * block_N + j < cur_seqlen_k, acc_s[i, j], -T.infinity(accum_dtype)) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # scores_max_prev is m_i - # scores_max is row_max->m_ij in triton T.copy(scores_max, S_shared[:, k]) - # scores_scale is alpha in triton for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) - # scores_sum is l_ij in triton - # logsum is l_i in triton for i in T.Parallel(block_H): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] T.copy(acc_s, acc_s_cast) for i, j in T.Parallel(block_H, dim): acc_o[i, j] *= scores_scale[i] - T.copy(V[cur_start_k + k * block_N:cur_start_k + (k + 1) * block_N, cur_kv_head, :], - V_shared) + T.copy(V[cur_start_k + k * block_N : cur_start_k + (k + 1) * block_N, cur_kv_head, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) if has_sink: - T.copy(s_aux[hid * valid_block_H:hid * valid_block_H + block_H], s_aux_shared) + T.copy(s_aux[hid * valid_block_H : hid * valid_block_H + block_H], s_aux_shared) for i in T.Parallel(block_H): logsum[i] += s_aux_shared[i] for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] for h, k in T.Parallel(block_H, math.ceil(max_seqlen_kv / block_N)): S_shared[h, k] = T.exp2((S_shared[h, k] - scores_max[h]) * scale) / logsum[h] - # T.copy(S_shared, S_fragment) - # for h, k in T.Parallel(block_H, math.ceil(max_seqlen_kv / block_N)): - # 
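The tilelang kernel writes the per-KV-block row max of the raw Q@K^T scores into `S_shared[:, k]` and later normalizes it with the global running max and `logsum`. Because exp is monotone, that normalized value is exactly the largest post-softmax probability inside the block, which is what makes `S` usable as a per-block importance score. A small PyTorch check of this equivalence (shapes illustrative):

import torch

def block_scores_from_logit_max(logits, block_n):
    # logits: [h, s] raw q@k^T * softmax_scale for one query token
    m = logits.max(dim=-1, keepdim=True).values
    l = torch.exp(logits - m).sum(dim=-1, keepdim=True)
    probs = torch.exp(logits - m) / l                          # full softmax
    h, s = logits.shape
    num_blocks = (s + block_n - 1) // block_n
    out = torch.zeros(h, num_blocks)
    for k in range(num_blocks):
        blk_max = logits[:, k * block_n:(k + 1) * block_n].max(dim=-1).values
        out[:, k] = torch.exp(blk_max - m[:, 0]) / l[:, 0]     # what the kernel stores in S
    # max-pooling the dense probabilities per block gives the same values
    ref = torch.stack([probs[:, k * block_n:(k + 1) * block_n].max(dim=-1).values
                       for k in range(num_blocks)], dim=-1)
    assert torch.allclose(out, ref, atol=1e-6)
    return out

block_scores_from_logit_max(torch.randn(8, 100), block_n=16)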
S_fragment[h, k] = T.exp2((S_fragment[h, k] - scores_max[h]) * scale) / logsum[h] for i in T.Parallel(block_H): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale T.copy(acc_o[:valid_block_H, :], O_shared) - T.copy(O_shared, Output[bid, hid * valid_block_H:(hid + 1) * valid_block_H, :]) - # T.copy(S_fragment, S_shared) - T.copy(S_shared[:valid_block_H, :], S[bid, - hid * valid_block_H:(hid + 1) * valid_block_H, :]) - - @T.prim_func - def flashattn_gqa_decode_no_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - cu_seqlens_k: T.Tensor([batch + 1], "int32"), - s_aux: T.Tensor([heads], "float32"), - Output: T.Tensor(shape_o, dtype), - S: T.Tensor(shape_s, dtype), - ): - flash_attn(Q, K, V, cu_seqlens_k, s_aux, Output, S) + T.copy(O_shared, Output[bid, hid * valid_block_H : (hid + 1) * valid_block_H, :]) + T.copy(S_shared, S_shared_cast) + T.copy(S_shared_cast[:valid_block_H, :], S[bid, hid * valid_block_H : (hid + 1) * valid_block_H, :]) - # TODO: split version return flashattn_gqa_decode_no_split -def flash_attn_with_attn_pool_decode_tilelang( - Q: torch.Tensor, ## [tq = b, q_h, q_dim] - K: torch.Tensor, ## [tk, k_h, k_dim] - V: torch.Tensor, - cu_seqlens_k: torch.Tensor, - max_seqlen_k: int, - real_max_k_seqlen: int, - num_split: int, - softmax_scale: float, - s_aux: torch.Tensor = None, - block_size: int = 64, - use_per_kv_head_sparse_index: bool = False, - tl_kernel=None, -): - num_tokens, q_h, head_size = Q.shape - batch = cu_seqlens_k.size(0) - 1 - k_h = K.size(1) - - assert Q.dim() == K.dim() == 3 - assert Q.size(2) == K.size(2) - assert cu_seqlens_k.dim() == 1 - assert head_size in {64, 128, 256} - assert Q.is_contiguous() - assert K.is_contiguous() - assert V.is_contiguous() - - gqa_group_size = q_h // k_h - - O_tl = torch.zeros_like(Q) - S_tl = torch.zeros((batch, q_h, math.ceil(real_max_k_seqlen / block_size)), - dtype=Q.dtype, - device=Q.device) - O_tl, S_tl = tl_kernel(Q, K, V, cu_seqlens_k, s_aux) - - if use_per_kv_head_sparse_index: - S_tl = torch.max_pool2d(S_tl, kernel_size=(gqa_group_size, 1), stride=(gqa_group_size, 1)) - else: - S_tl = torch.max_pool2d(S_tl, kernel_size=(q_h, 1), stride=(q_h, 1)) - - return O_tl, S_tl - - -def flash_attn_with_attn_pool_decode( - Q: torch.Tensor, ## [tq = b, q_h, q_dim] - K: torch.Tensor, ## [tk, k_h, k_dim] - V: torch.Tensor, - cu_seqlens_k: torch.Tensor, - max_seqlen_k: int, - real_max_k_seqlen: int, - num_split: int, - softmax_scale: float, - s_aux: torch.Tensor = None, - block_size: int = 64, - use_per_kv_head_sparse_index: bool = False, -): - num_tokens, q_h, head_size = Q.shape - batch = cu_seqlens_k.size(0) - 1 - k_h = K.size(1) - - assert Q.dim() == K.dim() == 3 - assert Q.size(2) == K.size(2) - assert cu_seqlens_k.dim() == 1 - assert head_size in {64, 128, 256} - assert Q.is_contiguous() - assert K.is_contiguous() - assert V.is_contiguous() - - gqa_group_size = q_h // k_h - - BLOCK_D = head_size - BLOCK_N = block_size - BLOCK_H = 64 - - O = torch.zeros_like(Q) - S = torch.zeros((batch, q_h, math.ceil(max_seqlen_k / block_size)), - dtype=Q.dtype, - device=Q.device) - - def grid(META): - return (batch, k_h) - - with torch.cuda.device(Q.device.index): - _fwd_kernel_varlen[grid]( - Q, - K, - V, - O, - S, - s_aux, - softmax_scale, - cu_seqlens_k, - *Q.stride(), - *K.stride(), - *V.stride(), - *O.stride(), - *S.stride(), - gqa_group_size, - BLOCK_H=BLOCK_H, - BLOCK_N=BLOCK_N, - BLOCK_D=BLOCK_D, - ) - - if use_per_kv_head_sparse_index: - S = torch.max_pool2d(S, 
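Downstream, the per-query-head block scores `S` of shape [b, q_heads, num_blocks] are max-pooled over the head dimension, either per KV-head group or over all heads, to get one sparsity score per block. A shape sketch with illustrative sizes:

import torch

b, q_heads, kv_heads, num_blocks = 2, 32, 8, 64
gqa_group_size = q_heads // kv_heads
S = torch.rand(b, q_heads, num_blocks)

per_kv_head = torch.max_pool2d(S, kernel_size=(gqa_group_size, 1), stride=(gqa_group_size, 1))
per_batch = torch.max_pool2d(S, kernel_size=(q_heads, 1), stride=(q_heads, 1))

assert per_kv_head.shape == (b, kv_heads, num_blocks)
assert per_batch.shape == (b, 1, num_blocks)
# Equivalent to a reshape + max over the grouped query heads:
ref = S.view(b, kv_heads, gqa_group_size, num_blocks).amax(dim=2)
assert torch.allclose(per_kv_head, ref)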
kernel_size=(gqa_group_size, 1), stride=(gqa_group_size, 1)) - else: - S = torch.max_pool2d(S, kernel_size=(q_h, 1), stride=(q_h, 1)) - - return O, S - - -def test_equal_seqlen_decode_main(args): - """Test decode kernel with equal sequence lengths""" - print("Testing decode kernel with equal sequence lengths") - - batch_size = args.batch_size - q_heads = args.q_heads - kv_heads = args.kv_heads - k_seqlen = args.k_seqlen - real_max_k_seqlen = args.k_seqlen - head_size = args.head_size - block_size = args.block_size - dtype = torch.bfloat16 if args.dtype == "bfloat16" else torch.float16 - - # For decode, query is just 1 token per batch - q = torch.randn(batch_size, q_heads, head_size, device='cuda', dtype=dtype) - k = torch.randn(batch_size, kv_heads, k_seqlen, head_size, device='cuda', dtype=dtype) - v = torch.randn(batch_size, kv_heads, k_seqlen, head_size, device='cuda', dtype=dtype) +def ref_attention(q, k, v, k_seqlens, q_heads, sink=None): + """ + Compute reference attention output and weights. + Args: + q: [b, q_heads, head_size] + k, v: [b, kv_heads, max_seqlen, head_size] + k_seqlens: [b] actual sequence lengths + sink: [q_heads] optional sink values + Returns: output [b, q_heads, head_size], attn_weights [b, q_heads, max_seqlen] + """ + batch_size, kv_heads, max_seqlen, head_size = k.shape softmax_scale = 1.0 / math.sqrt(head_size) - # Generate sink values if needed - sink = None - if args.test_sink: - sink = torch.randn(q_heads, device='cuda', dtype=torch.float32) * 0.1 # Small sink values - print(f"Using sink attention with sink values: {sink}") - - # Convert to varlen format for K, V - k_varlen = k.transpose(1, 2).reshape(batch_size * k_seqlen, kv_heads, head_size) - v_varlen = v.transpose(1, 2).reshape(batch_size * k_seqlen, kv_heads, head_size) - - # Generate cumulative sequence lengths - cu_seqlens_k = torch.arange( - 0, (batch_size + 1) * k_seqlen, k_seqlen, device='cuda', dtype=torch.int32) - max_seqlen_k = k_seqlen - - print(f"q shape: {q.shape}") - print(f"k_varlen shape: {k_varlen.shape}") - print(f"v_varlen shape: {v_varlen.shape}") + # Expand KV heads and compute attention scores + k = repeat_kv(k, q_heads // kv_heads) + v = repeat_kv(v, q_heads // kv_heads) + logits = torch.matmul(q.unsqueeze(2), k.transpose(-2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] - num_tokens, q_h, head_size = q.shape - batch = cu_seqlens_k.size(0) - 1 - k_h = k_varlen.size(1) - tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, - args.test_sink) - - # Test our decode kernel - O_triton, S_triton = flash_attn_with_attn_pool_decode( - q, - k_varlen, - v_varlen, - cu_seqlens_k, - max_seqlen_k, - real_max_k_seqlen, - args.num_split, - softmax_scale, - s_aux=sink, - block_size=block_size) - O_tilelang, S_tilelang = flash_attn_with_attn_pool_decode_tilelang( - q, - k_varlen, - v_varlen, - cu_seqlens_k, - max_seqlen_k, - real_max_k_seqlen, - args.num_split, - softmax_scale, - s_aux=sink, - block_size=block_size, - tl_kernel=tl_kernel, - ) - for i in range(batch_size): - S_tilelang[i, :, - math.ceil((cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item()) / - block_size):] = 0 - - # Compute torch reference - q_expanded = q.unsqueeze(2) # [b, q_heads, 1, head_size] - k_repeat = repeat_kv(k, q_heads // kv_heads) # [b, q_heads, k_seqlen, head_size] - v_repeat = repeat_kv(v, q_heads // kv_heads) # [b, q_heads, k_seqlen, head_size] + # Mask invalid positions + mask = torch.arange(max_seqlen, device=q.device).expand(batch_size, -1) >= k_seqlens.unsqueeze(1) + 
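`repeat_kv` expands [b, kv_heads, s, d] to [b, kv_heads * n_rep, s, d] so the GQA reference can use plain per-head matmuls; it is equivalent to `torch.repeat_interleave` on the head dimension. A quick check with illustrative sizes:

import torch

def repeat_kv_ref(x, n_rep):
    return torch.repeat_interleave(x, n_rep, dim=1)

x = torch.randn(2, 8, 16, 64)   # [b, kv_heads, s, d]
n_rep = 4
expanded = x[:, :, None, :, :].expand(2, 8, n_rep, 16, 64).reshape(2, 8 * n_rep, 16, 64)
assert torch.equal(expanded, repeat_kv_ref(x, n_rep))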
logits.masked_fill_(mask.unsqueeze(1).unsqueeze(2), float("-inf")) if sink is None: - # Standard scaled dot-product attention - logits = torch.matmul(q_expanded, k_repeat.transpose( - -2, -1)) * softmax_scale # [batch, q_heads, 1, seqlen_k] - attn_weights = torch.softmax(logits, dim=-1) - O_torch = torch.matmul(attn_weights, v_repeat).squeeze(2) # [batch, q_heads, head_size] + attn_weights = logits.softmax(dim=-1) else: - # s_aux attention - logits = torch.matmul(q_expanded, k_repeat.transpose( - -2, -1)) * softmax_scale # [batch, q_heads, 1, seqlen_k] - - sink_expanded = sink.view(1, q_heads, 1, 1) # [1, q_heads, 1, 1] - logits_max = torch.max(logits, dim=-1, keepdim=True).values - logits_or_sinks_max = torch.maximum(logits_max, sink_expanded) - sinks = torch.exp(sink_expanded - logits_or_sinks_max) - unnormalized_scores = torch.exp(logits - logits_or_sinks_max) - normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks - attn_weights = unnormalized_scores / normalizer - O_torch = torch.matmul(attn_weights.to(v_repeat.dtype), - v_repeat).squeeze(2) # [batch, q_heads, head_size] - - # Compute attention score pooling - attn_score_pooled = torch.max_pool2d( - attn_weights.squeeze(2), # [b, q_heads, k_seqlen] - kernel_size=(q_heads, block_size), - stride=(q_heads, block_size), - ceil_mode=True).to(torch.float16) - - print("S_tilelang", S_tilelang) - print("attn_score_pooled", attn_score_pooled) - - max_diff_o = torch.max(torch.abs(O_triton - O_torch)) - max_diff_s = torch.max(torch.abs(S_triton - attn_score_pooled)) - max_diff_o_tilelang = torch.max(torch.abs(O_tilelang - O_torch)) - max_diff_s_tilelang = torch.max(torch.abs(S_tilelang - attn_score_pooled)) - - print(f"Max difference in O: {max_diff_o.item()}") - print(f"Max difference in S: {max_diff_s.item()}") - print(f"Max difference in O_tilelang: {max_diff_o_tilelang.item()}") - print(f"Max difference in S_tilelang: {max_diff_s_tilelang.item()}") - assert torch.allclose( - O_triton, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o.item()}" - assert torch.allclose( - S_triton, attn_score_pooled, atol=1e-2, rtol=1e-2), f"Score mismatch: {max_diff_s.item()}" - assert torch.allclose( - O_tilelang, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o_tilelang.item()}" - assert torch.allclose( - S_tilelang, attn_score_pooled, atol=1e-2, - rtol=1e-2), f"Score mismatch: {max_diff_s_tilelang.item()}" - print("✅ All tests passed!") + # Sink attention: softmax with additional sink term + sink_expanded = sink.view(1, q_heads, 1, 1) + logits_max = torch.maximum(logits.max(dim=-1, keepdim=True).values, sink_expanded) + exp_logits = torch.exp(logits - logits_max) + attn_weights = exp_logits / (exp_logits.sum(dim=-1, keepdim=True) + torch.exp(sink_expanded - logits_max)) + attn_weights.masked_fill_(mask.unsqueeze(1).unsqueeze(2), 0.0) + output = torch.matmul(attn_weights.to(v.dtype), v).squeeze(2) + return output, attn_weights.squeeze(2) -def test_varlen_decode_main(args): - """Test decode kernel with variable sequence lengths""" - batch_size = args.batch_size - q_heads = args.q_heads - kv_heads = args.kv_heads - max_k_seqlen = args.k_seqlen # Use as max sequence length - real_max_k_seqlen = args.k_seqlen - head_size = args.head_size - block_size = args.block_size - dtype = torch.bfloat16 if args.dtype == "bfloat16" else torch.float16 - - print(f"Testing decode kernel with variable sequence lengths (max_k_seqlen={max_k_seqlen})") - # Generate sink values if needed - sink = None - if args.test_sink: - sink = 
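The sink branch in `ref_attention` is a numerically stable softmax over the real logits plus one extra per-head sink logit whose probability mass is then discarded. A small equivalence check (head and sequence sizes illustrative, float64 for tight tolerances):

import torch

heads, s = 4, 10
logits = torch.randn(1, heads, 1, s, dtype=torch.float64)
sink = torch.randn(heads, dtype=torch.float64)

sink_e = sink.view(1, heads, 1, 1)
m = torch.maximum(logits.max(dim=-1, keepdim=True).values, sink_e)
num = torch.exp(logits - m)
attn = num / (num.sum(dim=-1, keepdim=True) + torch.exp(sink_e - m))

# Reference: append the sink as an extra logit column, softmax, then drop that column.
ref = torch.softmax(torch.cat([logits, sink_e], dim=-1), dim=-1)[..., :s]
assert torch.allclose(attn, ref, atol=1e-12)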
torch.randn(q_heads, device='cuda', dtype=torch.float32) * 0.1 # Small sink values - print(f"Using sink attention with sink values: {sink}") +def test_varlen_decode_main(args): + """Test decode kernel with variable sequence lengths.""" + batch_size, q_heads, kv_heads = args.batch_size, args.q_heads, args.kv_heads + max_k_seqlen, head_size, block_size = args.k_seqlen, args.head_size, args.block_size + dtype = torch.bfloat16 if args.dtype == T.bfloat16 else torch.float16 - # Generate variable length k sequences + # Generate variable length sequences and cumulative lengths k_seqlens = torch.randint(max_k_seqlen // 4, max_k_seqlen + 1, size=(batch_size,)) - print(f"k_seqlens: {k_seqlens}") - - # Generate cumulative sequence lengths for k - cu_seqlens_k = torch.zeros(batch_size + 1, device='cuda', dtype=torch.int32) - total_k_tokens = 0 - for i in range(batch_size): - cu_seqlens_k[i] = total_k_tokens - total_k_tokens += k_seqlens[i] - cu_seqlens_k[batch_size] = total_k_tokens - - print(f"cu_seqlens_k: {cu_seqlens_k}") - - # Generate tensors - Q is [batch_size, q_heads, head_size] for decode - q_decode = torch.randn(batch_size, q_heads, head_size, device='cuda', dtype=dtype) - k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device='cuda', dtype=dtype) - v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device='cuda', dtype=dtype) - - softmax_scale = 1.0 / math.sqrt(head_size) - max_seqlen_k = int(k_seqlens.max()) - - print(f"Actual max_seqlen_k: {max_seqlen_k}") - print(f"q_decode shape: {q_decode.shape}") - print(f"k_varlen shape: {k_varlen.shape}") - print(f"v_varlen shape: {v_varlen.shape}") - - num_tokens, q_h, head_size = q_decode.shape - batch = cu_seqlens_k.size(0) - 1 - k_h = k_varlen.size(1) - tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, - args.test_sink) - - # Test our decode kernel - O_triton, S_triton = flash_attn_with_attn_pool_decode( - q_decode, - k_varlen, - v_varlen, - cu_seqlens_k, - max_seqlen_k, - real_max_k_seqlen, - args.num_split, - softmax_scale, - s_aux=sink, - block_size=block_size) - O_tilelang, S_tilelang = flash_attn_with_attn_pool_decode_tilelang( - q_decode, - k_varlen, - v_varlen, - cu_seqlens_k, - max_seqlen_k, - real_max_k_seqlen, - args.num_split, - softmax_scale, - s_aux=sink, - block_size=block_size, - tl_kernel=tl_kernel, - ) + cu_seqlens_k = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32) + cu_seqlens_k[1:] = torch.cumsum(k_seqlens, dim=0).to(torch.int32).cuda() + total_k_tokens = cu_seqlens_k[-1].item() + + # Generate input tensors + q = torch.randn(batch_size, q_heads, head_size, device="cuda", dtype=dtype) + k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 if args.test_sink else None + + # Run tilelang kernel + tilelang.disable_cache() + tl_kernel = flashattn(batch_size, q_heads, kv_heads, max_k_seqlen, total_k_tokens, head_size, args.test_sink) + O_tl, S_tl = tl_kernel(q, k_varlen, v_varlen, cu_seqlens_k, sink) + S_tl = torch.max_pool2d(S_tl, kernel_size=(q_heads, 1), stride=(q_heads, 1)) + + # Mask out invalid S positions for i in range(batch_size): - S_tilelang[i, :, - math.ceil((cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item()) / - block_size):] = 0 - - # Create torch reference - pad tensors for comparison - k_padded_list = [] - v_padded_list = [] + valid_blocks = 
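The varlen layout packs every batch's K/V tokens into a single [total_tokens, kv_heads, dim] tensor addressed through cumulative sequence lengths, and the per-block score mask only keeps ceil(seqlen / block_size) blocks per batch. A CPU-side sketch of that bookkeeping (sizes illustrative):

import math
import torch

k_seqlens = torch.tensor([5, 9, 3])
cu_seqlens_k = torch.zeros(len(k_seqlens) + 1, dtype=torch.int32)
cu_seqlens_k[1:] = torch.cumsum(k_seqlens, dim=0)
assert cu_seqlens_k.tolist() == [0, 5, 14, 17]
total_tokens = int(cu_seqlens_k[-1])

k_varlen = torch.randn(total_tokens, 2, 8)      # [total_tokens, kv_heads, dim]
for i in range(len(k_seqlens)):
    seg = k_varlen[cu_seqlens_k[i]:cu_seqlens_k[i + 1]]
    assert seg.shape[0] == k_seqlens[i]

# Block-level bookkeeping used when masking the per-block scores S:
block_size = 4
valid_blocks = [math.ceil(int(n) / block_size) for n in k_seqlens]
assert valid_blocks == [2, 3, 1]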
math.ceil(k_seqlens[i].item() / block_size) + S_tl[i, :, valid_blocks:] = 0 + # Prepare padded tensors for reference + actual_max = int(k_seqlens.max()) + k_padded = torch.zeros(batch_size, kv_heads, actual_max, head_size, device="cuda", dtype=dtype) + v_padded = torch.zeros(batch_size, kv_heads, actual_max, head_size, device="cuda", dtype=dtype) for i in range(batch_size): - actual_k_len = k_seqlens[i] - - # Extract and pad k, v for this batch - k_start = cu_seqlens_k[i] - k_end = cu_seqlens_k[i + 1] - - # Pad to max_seqlen_k - k_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, device='cuda', dtype=dtype) - v_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, device='cuda', dtype=dtype) - - k_padded[:actual_k_len] = k_varlen[k_start:k_end] - v_padded[:actual_k_len] = v_varlen[k_start:k_end] - - k_padded_list.append(k_padded) - v_padded_list.append(v_padded) + seq_len = k_seqlens[i].item() + k_padded[i, :, :seq_len] = k_varlen[cu_seqlens_k[i] : cu_seqlens_k[i + 1]].transpose(0, 1) + v_padded[i, :, :seq_len] = v_varlen[cu_seqlens_k[i] : cu_seqlens_k[i + 1]].transpose(0, 1) - # Stack to create batched tensors [b, max_seqlen, kv_heads, head_size] - k_padded_batched = torch.stack( - k_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] - v_padded_batched = torch.stack( - v_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] - - # Expand q to match kv heads: [b, q_heads, 1, head_size] - q_expanded = q_decode.unsqueeze(2) # [b, q_heads, 1, head_size] - - print(f"q_expanded shape: {q_expanded.shape}") - print(f"k_padded_batched shape: {k_padded_batched.shape}") - print(f"v_padded_batched shape: {v_padded_batched.shape}") - - # Compute torch reference - k_repeat = repeat_kv(k_padded_batched, - q_heads // kv_heads) # [b, q_heads, max_seqlen, head_size] - v_repeat = repeat_kv(v_padded_batched, - q_heads // kv_heads) # [b, q_heads, max_seqlen, head_size] - - if sink is None: - # Standard attention computation: [b, q_heads, 1, head_size] @ [b, q_heads, head_size, max_seqlen] - attn_score = torch.matmul(q_expanded, k_repeat.transpose( - -2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] - - # Apply sequence length masking - for i in range(batch_size): - actual_k_len = k_seqlens[i] - attn_score[i, :, :, actual_k_len:] = float('-inf') - - attn_weights = attn_score.softmax(dim=-1) # [b, q_heads, 1, max_seqlen] - - # Mask out invalid positions - for i in range(batch_size): - actual_k_len = k_seqlens[i] - attn_weights[i, :, :, actual_k_len:] = 0.0 - - # Compute output: [b, q_heads, 1, max_seqlen] @ [b, q_heads, max_seqlen, head_size] - O_torch = torch.matmul(attn_weights, v_repeat) # [b, q_heads, 1, head_size] - else: - # s_aux attention - logits = torch.matmul(q_expanded, k_repeat.transpose( - -2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] - - # Apply sequence length masking - for i in range(batch_size): - actual_k_len = k_seqlens[i] - logits[i, :, :, actual_k_len:] = float('-inf') - - sink_expanded = sink.view(1, q_heads, 1, 1) # [1, q_heads, 1, 1] - logits_max = torch.max(logits, dim=-1, keepdim=True).values - logits_or_sinks_max = torch.maximum(logits_max, sink_expanded) - sinks = torch.exp(sink_expanded - logits_or_sinks_max) - unnormalized_scores = torch.exp(logits - logits_or_sinks_max) - normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks - attn_weights = unnormalized_scores / normalizer - - # Mask out invalid positions - for i in range(batch_size): - actual_k_len = k_seqlens[i] - attn_weights[i, :, :, 
actual_k_len:] = 0.0 - - # Compute output: [b, q_heads, 1, max_seqlen] @ [b, q_heads, max_seqlen, head_size] - O_torch = torch.matmul(attn_weights.to(v_repeat.dtype), - v_repeat) # [b, q_heads, 1, head_size] - - O_torch = O_torch.squeeze(2) # [b, q_heads, head_size] - - # Compute attention score pooling for S - attn_score_pooled = torch.max_pool2d( - attn_weights.squeeze(2), # [b, q_heads, max_seqlen] - kernel_size=(q_heads, block_size), - stride=(q_heads, block_size), - ceil_mode=True).to(dtype=torch.float16) # [b, 1, ceil(max_seqlen/block_size)] - - print(f"O_triton shape: {O_triton.shape}") - print(f"O_tilelang shape: {O_tilelang.shape}") - print(f"O_torch shape: {O_torch.shape}") - print(f"S_triton shape: {S_triton.shape}") - print(f"S_tilelang shape: {S_tilelang.shape}") - print(f"attn_score_pooled shape: {attn_score_pooled.shape}") + # Compute reference + O_ref, attn_weights = ref_attention(q, k_padded, v_padded, k_seqlens.cuda(), q_heads, sink) + S_ref = torch.max_pool2d(attn_weights, kernel_size=(q_heads, block_size), stride=(q_heads, block_size), ceil_mode=True).to(dtype) # Compare results - max_diff_o = torch.max(torch.abs(O_triton - O_torch)) - max_diff_o_tl = torch.max(torch.abs(O_tilelang - O_torch)) - print(f"Max difference in O: {max_diff_o.item()}") - print(f"Max difference in O_tilelang: {max_diff_o_tl.item()}") - - max_diff_s = torch.max(torch.abs(S_triton - attn_score_pooled)) - max_diff_s_tl = torch.max( - torch.abs(S_tilelang[:, :, :math.ceil(max_seqlen_k / block_size)] - attn_score_pooled)) - print(f"Max difference in S: {max_diff_s.item()}") - print(f"Max difference in S_tilelang: {max_diff_s_tl.item()}") - - assert torch.allclose( - O_triton, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o.item()}" - assert torch.allclose( - S_triton, attn_score_pooled, atol=1e-2, rtol=1e-2), f"Score mismatch: {max_diff_s.item()}" - assert torch.allclose( - O_tilelang, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o_tl.item()}" - assert torch.allclose( - S_tilelang[:, :, :math.ceil(max_seqlen_k / block_size)], - attn_score_pooled, - atol=1e-2, - rtol=1e-2), f"Score mismatch: {max_diff_s_tl.item()}" - + num_blocks = math.ceil(actual_max / block_size) + assert torch.allclose(O_tl, O_ref, atol=1e-2, rtol=1e-2), f"Output mismatch: {(O_tl - O_ref).abs().max()}" + assert torch.allclose(S_tl[:, :, :num_blocks], S_ref[:, :, :num_blocks], atol=1e-2, rtol=1e-2), "Score mismatch" print("✅ All tests passed!") -def do_bench(fn, *args, warmup=10, rep=10, **kwargs): - """ - Do benchmark for a function. 
- """ - start_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - end_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - for _ in range(warmup): - fn(*args, **kwargs) - - torch.cuda.synchronize() - for i in range(rep): - start_event[i].record() - fn(*args, **kwargs) - end_event[i].record() - torch.cuda.synchronize() - - # Record clocks - times = torch.tensor( - [s.elapsed_time(e) for s, e in zip(start_event, end_event)], - dtype=torch.float, - ) - - return times.mean().item() - - def speed_benchmark_decode_comparison(args): """Speed benchmark for decode kernel""" batch_size = args.batch_size @@ -844,7 +223,7 @@ def speed_benchmark_decode_comparison(args): max_k_seqlen = args.k_seqlen head_size = args.head_size block_size = args.block_size - dtype = torch.bfloat16 if args.dtype == "bfloat16" else torch.float16 + dtype = torch.bfloat16 if args.dtype == T.bfloat16 else torch.float16 print("\n=== Decode Speed Benchmark Comparison ===") print("Configuration:") @@ -865,7 +244,7 @@ def speed_benchmark_decode_comparison(args): k_seqlens = torch.full((batch_size,), max_k_seqlen, dtype=int) # Generate cumulative sequence lengths for k - cu_seqlens_k = torch.zeros(batch_size + 1, device='cuda', dtype=torch.int32) + cu_seqlens_k = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32) total_k_tokens = 0 for i in range(batch_size): cu_seqlens_k[i] = total_k_tokens @@ -873,88 +252,68 @@ def speed_benchmark_decode_comparison(args): cu_seqlens_k[batch_size] = total_k_tokens # Generate tensors - q_decode = torch.randn(batch_size, q_heads, head_size, device='cuda', dtype=dtype) - k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device='cuda', dtype=dtype) - v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device='cuda', dtype=dtype) - - softmax_scale = 1.0 / math.sqrt(head_size) - max_seqlen_k = int(k_seqlens.max()) - - # Generate sink values if needed - sink = None - if args.test_sink: - sink = torch.randn(q_heads, device='cuda', dtype=torch.float32) * 0.1 # Small sink values - print(" Using sink attention with sink values") - - print("Setup complete:") - print(f" Total K tokens: {total_k_tokens}") - print(f" Actual max K seq len: {max_seqlen_k}") + q_decode = torch.randn(batch_size, q_heads, head_size, device="cuda", dtype=dtype) + k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 if args.test_sink else None if args.test_varlen: print(f" K sequence lengths: {k_seqlens.tolist()}") - # Warmup - num_tokens, q_h, head_size = q_decode.shape + _, q_h, head_size = q_decode.shape batch = cu_seqlens_k.size(0) - 1 k_h = k_varlen.size(1) - tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, - args.test_sink) + tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, args.test_sink) + + def run_once(): + tl_kernel(q_decode, k_varlen, v_varlen, cu_seqlens_k, sink) # Benchmark print("⚡ Benchmarking Tilelang kernel (100 iterations)...") tilelang_time = do_bench( - flash_attn_with_attn_pool_decode_tilelang, - q_decode, - k_varlen, - v_varlen, - cu_seqlens_k, - max_seqlen_k, - args.k_seqlen, - 1, - softmax_scale, - sink, - block_size, - False, - tl_kernel, + run_once, ) print(f"Average decode kernel time Tilelang: {tilelang_time:.3f} ms") - # Benchmark - print("⚡ Benchmarking Triton kernel 
(100 iterations)...") - triton_time = do_bench(flash_attn_with_attn_pool_decode, q_decode, k_varlen, v_varlen, - cu_seqlens_k, max_seqlen_k, args.k_seqlen, 1, softmax_scale, sink, - block_size) - print(f"Average decode kernel time Triton: {triton_time:.3f} ms") - print(f"Speedup: {(triton_time / tilelang_time):.3f}") +def main(): + args = argparse.Namespace( + batch_size=1, + q_heads=32, + kv_heads=8, + k_seqlen=8192, + head_size=128, + block_size=128, + dtype=T.float16, + ) + args.test_sink = True + args.test_varlen = True + args.dtype = T.float16 + args.num_split = 1 + test_varlen_decode_main(args) if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Flash Attention Decode with Attention Pooling') - parser.add_argument('--batch_size', type=int, default=1, help='Batch size') - parser.add_argument('--q_heads', type=int, default=32, help='Number of query heads') - parser.add_argument('--kv_heads', type=int, default=8, help='Number of key-value heads') - parser.add_argument('--k_seqlen', type=int, default=8192, help='Key sequence length') - parser.add_argument( - '--head_size', type=int, default=128, choices=[64, 128, 256], help='Head dimension') - parser.add_argument('--block_size', type=int, default=64, help='Block size for computation') - parser.add_argument( - '--dtype', type=str, default='bfloat16', choices=['float16', 'bfloat16'], help='Data type') - parser.add_argument( - '--test_varlen', action='store_true', help='Test with truly variable sequence lengths') - parser.add_argument( - '--test_sink', action='store_true', help='Test with sink attention mechanism') - parser.add_argument('--benchmark', action='store_true', help='Run speed benchmark') - parser.add_argument( - '--num_split', type=int, default=1, choices=[1, 16], help='Number of splits') + parser = argparse.ArgumentParser(description="Flash Attention Decode with Attention Pooling") + parser.add_argument("--batch_size", type=int, default=1, help="Batch size") + parser.add_argument("--q_heads", type=int, default=32, help="Number of query heads") + parser.add_argument("--kv_heads", type=int, default=8, help="Number of key-value heads") + parser.add_argument("--k_seqlen", type=int, default=8192, help="Key sequence length") + parser.add_argument("--head_size", type=int, default=128, choices=[64, 128, 256], help="Head dimension") + parser.add_argument("--block_size", type=int, default=128, help="Block size for computation") + parser.add_argument("--dtype", type=str, default=T.bfloat16, choices=[T.float16, T.bfloat16], help="Data type") + parser.add_argument("--test_varlen", action="store_true", help="Test with truly variable sequence lengths") + parser.add_argument("--test_sink", action="store_true", help="Test with sink attention mechanism") + parser.add_argument("--benchmark", action="store_true", help="Run speed benchmark") + parser.add_argument("--num_split", type=int, default=1, choices=[1, 16], help="Number of splits") args = parser.parse_args() args.test_sink = True - args.test_varlen = False - args.dtype = 'float16' + args.test_varlen = True + args.dtype = T.float16 args.num_split = 1 - if args.benchmark: - speed_benchmark_decode_comparison(args) - elif args.test_varlen: - test_varlen_decode_main(args) - else: - test_equal_seqlen_decode_main(args) + # if args.benchmark: + # speed_benchmark_decode_comparison(args) + # else: + # test_varlen_decode_main(args) + + speed_benchmark_decode_comparison(args) diff --git a/examples/flash_decoding/example_mha_inference.py 
b/examples/flash_decoding/example_mha_inference.py index 3eabc9a76..24a90c57b 100644 --- a/examples/flash_decoding/example_mha_inference.py +++ b/examples/flash_decoding/example_mha_inference.py @@ -10,102 +10,24 @@ @tilelang.jit(out_idx=[5]) def flashattn(batch, heads, seqlen_q, seqlen_kv, dim, is_causal, block_M, block_N): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape_q = [batch, seqlen_q, heads, dim] shape_kv = [batch, seqlen_kv, heads, dim] part_shape = [batch, seqlen_q, heads, num_split, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 - @T.macro - def MMA0( + @T.prim_func + def flashattn_mha_inference( + Q: T.Tensor(shape_q, dtype), K: T.Tensor(shape_kv, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - mid: T.int32, - hid: T.int32, - bid: T.int32, - sid: T.int32, - ): - T.copy( - K[bid, (seqlen_kv // num_split) * sid + k * block_N:(seqlen_kv // num_split) * sid + - (k + 1) * block_N, hid, :], K_shared) - # TODO: Handle causal split case - if is_causal: - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(mid * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( V: T.Tensor(shape_kv, dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - hid: T.int32, - bid: T.int32, - sid: T.int32, - ): - T.copy( - V[bid, (seqlen_kv // num_split) * sid + k * block_N:(seqlen_kv // num_split) * sid + - (k + 1) * block_N, hid, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + glse: T.Tensor([batch, heads, num_split, seqlen_q], dtype), + Output_partial: T.Tensor(part_shape, dtype), # [batch, seqlen_q, heads, num_split, dim] + Output: T.Tensor(shape_q, dtype), ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. - # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. 
- acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - - @T.macro - def flash_attn_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_kv, dtype), - V: T.Tensor(shape_kv, dtype), - glse: T.Tensor([batch, heads, num_split, seqlen_q], dtype), - Output_partial: T.Tensor(part_shape, dtype), - ): - with T.Kernel( - T.ceildiv(seqlen_q, block_M), heads * batch, num_split, - threads=128) as (bx, by, bz): + # split + with T.Kernel(T.ceildiv(seqlen_q, block_M), heads * batch, num_split, threads=128) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -126,43 +48,73 @@ def flash_attn_split( # NOTE(wt): tma barrier has some problems with padded dimensions (seq_q here) currently # disable relevant tma copy and use SIMT as fallback for now - T.copy(Q[bid, mid * block_M:(mid + 1) * block_M, hid, :], Q_shared, disable_tma=True) + T.copy(Q[bid, mid * block_M : (mid + 1) * block_M, hid, :], Q_shared, disable_tma=True) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) # TODO: Handle causal split case loop_range = ( - T.min(T.ceildiv(seqlen_kv, block_N), T.ceildiv( - (mid + 1) * block_M, block_N)) if is_causal else T.ceildiv( - (seqlen_kv // num_split), block_N)) + T.min(T.ceildiv(seqlen_kv, block_N), T.ceildiv((mid + 1) * block_M, block_N)) + if is_causal + else T.ceildiv((seqlen_kv // num_split), block_N) + ) for k in T.Pipelined(loop_range, num_stages=2): - MMA0(K, Q_shared, K_shared, acc_s, k, mid, hid, bid, sid) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, hid, bid, sid) + T.copy( + K[bid, (seqlen_kv // num_split) * sid + k * block_N : (seqlen_kv // num_split) * sid + (k + 1) * block_N, hid, :], + K_shared, + ) + # TODO: Handle causal split case + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(mid * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) + else: + T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + # To do causal softmax, we need to set the scores_max to 0 if it is -inf + # This process is called Check_inf in FlashAttention3 code, and it only need to be done + # in the first ceil_div(kBlockM, kBlockN) steps. + # for i in T.Parallel(block_M): + # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + # max * log_2(e)) This allows the compiler to use the ffma + # instruction instead of fadd and fmul separately. 
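For the causal path, the KV-block loop only has to visit blocks that intersect the causal triangle of query block `mid`, which is where the T.min(ceildiv(seqlen_kv, block_N), ceildiv((mid + 1) * block_M, block_N)) bound comes from. A tiny sketch of that bound (tile sizes illustrative):

import math

def causal_loop_range(mid, block_M, block_N, seqlen_kv):
    return min(math.ceil(seqlen_kv / block_N), math.ceil((mid + 1) * block_M / block_N))

block_M, block_N, seqlen_kv = 128, 64, 1024
for mid in range(math.ceil(seqlen_kv / block_M)):
    hi = causal_loop_range(mid, block_M, block_N, seqlen_kv)
    last_query = (mid + 1) * block_M - 1     # largest query index in this block
    assert hi * block_N > last_query         # visited KV positions reach the diagonal
    assert hi <= math.ceil(seqlen_kv / block_N)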
+ acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy( + V[bid, (seqlen_kv // num_split) * sid + k * block_N : (seqlen_kv // num_split) * sid + (k + 1) * block_N, hid, :], + V_shared, + ) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, glse[bid, hid, sid, mid * block_M:(mid + 1) * block_M]) + T.copy(logsum, glse[bid, hid, sid, mid * block_M : (mid + 1) * block_M]) T.copy(acc_o, O_shared) - T.copy( - O_shared, - Output_partial[bid, mid * block_M:(mid + 1) * block_M, hid, sid, :], - disable_tma=True) - - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split, seqlen_q], dtype), - Output_partial: T.Tensor(part_shape, dtype), - Output: T.Tensor(shape_q, dtype), - ): + T.copy(O_shared, Output_partial[bid, mid * block_M : (mid + 1) * block_M, hid, sid, :], disable_tma=True) + + # combine with T.Kernel(T.ceildiv(seqlen_q, block_M), heads, batch, threads=128) as (bx, by, bz): po_local = T.alloc_fragment([block_M, dim], dtype) - po_shared = T.alloc_shared([block_M, dim], dtype) o_accum_local = T.alloc_fragment([block_M, dim], accum_dtype) o_shared = T.alloc_shared([block_M, dim], dtype) lse_local = T.alloc_fragment([num_split, block_M], dtype) @@ -171,20 +123,17 @@ def combine( lse_max_local = T.alloc_fragment([block_M], accum_dtype) scale_local = T.alloc_fragment([block_M], accum_dtype) - T.annotate_layout({ - o_accum_local: T.Fragment(o_accum_local.shape, forward_thread_fn=lambda i, j: i), - o_shared: tilelang.layout.make_swizzled_layout(o_shared), - po_shared: tilelang.layout.make_swizzled_layout(po_shared), - }) - T.clear(lse_logsum_local) T.clear(o_accum_local) - T.copy(glse[ - bz, - by, - :, - bx * block_M:(bx + 1) * block_M, - ], lse_local) + T.copy( + glse[ + bz, + by, + :, + bx * block_M : (bx + 1) * block_M, + ], + lse_local, + ) T.reduce_max(lse_local, lse_max_local, dim=0, clear=False) for k in T.Pipelined(num_split): T.copy(lse_local[k, :], lse_local_split) @@ -193,11 +142,7 @@ def combine( for i in T.Parallel(block_M): lse_logsum_local[i] = T.log2(lse_logsum_local[i]) + lse_max_local[i] for k in T.Pipelined(num_split, num_stages=2): - T.copy( - Output_partial[bz, bx * block_M:(bx + 1) * block_M, by, k, :], - po_shared, - disable_tma=True) - T.copy(po_shared, po_local) + T.copy(Output_partial[bz, bx * block_M : (bx + 1) * block_M, by, k, :], po_local) for i in T.Parallel(block_M): lse_local_split[i] = lse_local[k, i] for i in T.Parallel(block_M): @@ -205,19 +150,7 @@ def combine( for i, j in T.Parallel(block_M, dim): o_accum_local[i, j] += po_local[i, j] * scale_local[i] T.copy(o_accum_local, o_shared) - T.copy(o_shared, Output[bz, bx * block_M:(bx + 1) * block_M, by, :], disable_tma=True) - - @T.prim_func - def flashattn_mha_inference( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_kv, dtype), - V: T.Tensor(shape_kv, dtype), - glse: T.Tensor([batch, heads, num_split, seqlen_q], dtype), - Output_partial: T.Tensor(part_shape, dtype), # [batch, seqlen_q, heads, num_split, dim] - Output: T.Tensor(shape_q, dtype), - ): - flash_attn_split(Q, K, V, glse, Output_partial) - combine(glse, Output_partial, Output) + T.copy(o_shared, 
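The split kernel stores, per KV split, a partial output already normalized by that split's own softmax sum, plus its log-sum-exp in the log2 domain (`glse`); the combine kernel then reweights each partial with exp2(lse_split - lse_total). A dense PyTorch sketch of the same reduction (sizes illustrative, no softmax scale applied on either side):

import torch

torch.manual_seed(0)
d, s, num_split = 16, 256, 4
q = torch.randn(d, dtype=torch.float64)
k = torch.randn(s, d, dtype=torch.float64)
v = torch.randn(s, d, dtype=torch.float64)
log2e = 1.4426950408889634

partials, lses = [], []
for ks in range(num_split):
    kb = k[ks * s // num_split:(ks + 1) * s // num_split]
    vb = v[ks * s // num_split:(ks + 1) * s // num_split]
    logits = kb @ q                                 # [s / num_split]
    m = logits.max()
    p = torch.exp2((logits - m) * log2e)            # exp(logits - m)
    partials.append((p @ vb) / p.sum())             # split-local normalized output
    lses.append(torch.log2(p.sum()) + m * log2e)    # log2 of the split's softmax sum

lses = torch.stack(lses)
lse_max = lses.max()
lse_total = torch.log2(torch.exp2(lses - lse_max).sum()) + lse_max
out = sum(p * torch.exp2(l - lse_total) for p, l in zip(partials, lses))

ref = torch.softmax(k @ q, dim=0) @ v
assert torch.allclose(out, ref, atol=1e-9)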
Output[bz, bx * block_M : (bx + 1) * block_M, by, :], disable_tma=True) return flashattn_mha_inference @@ -225,10 +158,10 @@ def flashattn_mha_inference( def ref_program(Q, K, V, glse, Output_partial, causal): assert causal is False dim = Q.size(-1) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -256,7 +189,7 @@ def flash_split_ref(Q, K, V, causal): block_N = 128 seqlen_kv = K.size(1) - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) acc_s = torch.empty((batch, nheads, block_M, block_N), device="cuda", dtype=torch.float) acc_s_cast = torch.empty((batch, nheads, block_M, block_N), device="cuda", dtype=torch.float16) acc_o = torch.empty((batch, block_M, nheads, dim), device="cuda", dtype=torch.float) @@ -273,14 +206,15 @@ def flash_split_ref(Q, K, V, causal): for ks in range(num_split): acc_o.fill_(0) logsum.fill_(0) - scores_max.fill_(float('-inf')) - scores_max_prev.fill_(float('-inf')) + scores_max.fill_(float("-inf")) + scores_max_prev.fill_(float("-inf")) for i in range(int((seqlen_kv // num_split) / block_N)): acc_s.fill_(0) - acc_s = torch.einsum('bqhd,bkhd->bhqk', Q_, - K[:, (seqlen_kv // num_split) * ks + - i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) # [batch, seqlen, nheads, block_N] + acc_s = torch.einsum( + "bqhd,bkhd->bhqk", + Q_, + K[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) # [batch, seqlen, nheads, block_N] scores_max_prev = scores_max scores_max = acc_s.max(dim=-1, keepdim=False).values # [blockM] scores_scale = torch.exp2(scores_max_prev - scores_max) @@ -288,9 +222,10 @@ def flash_split_ref(Q, K, V, causal): acc_s = torch.exp2(acc_s - scores_max[:, :, :, None]) acc_s_cast = acc_s.to(torch.float16) acc_o += torch.einsum( - 'bhqk,bkhd->bqhd', acc_s_cast, - V[:, (seqlen_kv // num_split) * ks + i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) + "bhqk,bkhd->bqhd", + acc_s_cast, + V[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) scores_sum = acc_s.sum(dim=-1, keepdim=False) logsum = logsum * scores_scale + scores_sum acc_o /= logsum[:, :, :, None].transpose(1, 2) @@ -298,8 +233,7 @@ def flash_split_ref(Q, K, V, causal): gacc_o[ks, :, :, :, :] = acc_o glogsum[ks, :, :, :] = logsum - return glogsum.to(torch.float16).permute(1, 2, 0, - 3), gacc_o.to(torch.float16).permute(1, 2, 3, 0, 4) + return glogsum.to(torch.float16).permute(1, 2, 0, 3), gacc_o.to(torch.float16).permute(1, 2, 3, 0, 4) def main(BATCH=1, H=32, Q_CTX=128, KV_CTX=8192, D_HEAD=128, causal=False): @@ -323,5 +257,13 @@ def main(BATCH=1, H=32, Q_CTX=128, KV_CTX=8192, D_HEAD=128, causal=False): print("{:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf(BATCH=1, H=32, Q_CTX=128, KV_CTX=8192, D_HEAD=128, causal=False): + BLOCK_M = 128 + BLOCK_N = 64 + kernel = flashattn(BATCH, H, Q_CTX, KV_CTX, D_HEAD, causal, BLOCK_M, BLOCK_N) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": main() diff --git 
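`ref_program` keeps tensors in [b, seq, heads, dim] layout, so the contraction "bqhd,bkhd->bhqk" is the per-head Q@K^T. A small equivalence check against an explicit permute + matmul (sizes illustrative):

import torch

b, sq, sk, h, d = 2, 4, 6, 3, 8
Q = torch.randn(b, sq, h, d)
K = torch.randn(b, sk, h, d)

scores_einsum = torch.einsum("bqhd,bkhd->bhqk", Q, K)
scores_matmul = torch.matmul(Q.permute(0, 2, 1, 3), K.permute(0, 2, 3, 1))
assert torch.allclose(scores_einsum, scores_matmul, atol=1e-6)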
a/examples/flash_decoding/regression_example_flash_decoding.py b/examples/flash_decoding/regression_example_flash_decoding.py new file mode 100644 index 000000000..476bceb34 --- /dev/null +++ b/examples/flash_decoding/regression_example_flash_decoding.py @@ -0,0 +1,17 @@ +import tilelang.testing +import example_gqa_decode +import example_mha_inference + + +def regression_example_gqa_decode(): + tilelang.testing.process_func(example_gqa_decode.run_regression_perf) + + +def regression_example_mha_inference(): + tilelang.testing.process_func( + example_mha_inference.run_regression_perf, BATCH=1, H=32, Q_CTX=128, KV_CTX=2048, D_HEAD=128, causal=False + ) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/flash_decoding/test_example_flash_decoding.py b/examples/flash_decoding/test_example_flash_decoding.py index c728dfe0e..2cbcd8404 100644 --- a/examples/flash_decoding/test_example_flash_decoding.py +++ b/examples/flash_decoding/test_example_flash_decoding.py @@ -2,9 +2,9 @@ import example_gqa_decode import example_mha_inference +import example_gqa_decode_varlen_logits -# TODO(lei): fix the correctness of gqa decode on sm90 @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_le(8, 9) def test_example_example_gqa_decode(): @@ -15,5 +15,9 @@ def test_example_example_mha_inference(): example_mha_inference.main(BATCH=1, H=32, Q_CTX=128, KV_CTX=2048, D_HEAD=128, causal=False) +def test_example_example_gqa_decode_varlen_logits(): + example_gqa_decode_varlen_logits.main() + + if __name__ == "__main__": tilelang.testing.main() diff --git a/examples/fusedmoe/example_fusedmoe_tilelang.py b/examples/fusedmoe/example_fusedmoe_tilelang.py index a8d684965..4b843cdfe 100644 --- a/examples/fusedmoe/example_fusedmoe_tilelang.py +++ b/examples/fusedmoe/example_fusedmoe_tilelang.py @@ -9,17 +9,18 @@ @tilelang.jit(pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) -def moe_forward_tilelang_shared(d_hidden, - d_expert, - n_shared_experts, - dtype, - num_tokens, - block_token=128, - block_dhidden=128, - block_dexpert=128, - threads=256, - num_stages=1): - +def moe_forward_tilelang_shared( + d_hidden, + d_expert, + n_shared_experts, + dtype, + num_tokens, + block_token=128, + block_dhidden=128, + block_dexpert=128, + threads=256, + num_stages=1, +): scale = 1.44269504 # log2(e) # Parameters @@ -32,21 +33,19 @@ def moe_forward_tilelang_shared(d_hidden, shared_W_up_shape = (dexpert, dhidden) shared_W_down_shape = (dhidden, dexpert) - accum_type = "float32" + accum_type = T.float32 @T.prim_func def kernel_shared( - input: T.Tensor(input_shape, dtype), # type: ignore - shared_W_gate: T.Tensor(shared_W_gate_shape, dtype), # type: ignore - shared_W_up: T.Tensor(shared_W_up_shape, dtype), # type: ignore - shared_W_down: T.Tensor(shared_W_down_shape, dtype), # type: ignore - up_logits: T.Tensor((num_tokens, dexpert), dtype), # type: ignore - output: T.Tensor(input_shape, dtype), # type: ignore + input: T.Tensor(input_shape, dtype), # type: ignore + shared_W_gate: T.Tensor(shared_W_gate_shape, dtype), # type: ignore + shared_W_up: T.Tensor(shared_W_up_shape, dtype), # type: ignore + shared_W_down: T.Tensor(shared_W_down_shape, dtype), # type: ignore + up_logits: T.Tensor((num_tokens, dexpert), dtype), # type: ignore + output: T.Tensor(input_shape, dtype), # type: ignore ): # Step 1: Compute gate and up logits - with T.Kernel( - T.ceildiv(num_tokens, block_token), T.ceildiv(dexpert, block_dexpert), - threads=threads) as (bx, by): + 
with T.Kernel(T.ceildiv(num_tokens, block_token), T.ceildiv(dexpert, block_dexpert), threads=threads) as (bx, by): # Split the block to shared experts and routed experts input_shared = T.alloc_fragment((block_token, block_dhidden), dtype=dtype) W_gate_shared = T.alloc_shared((block_dexpert, block_dhidden), dtype=dtype) @@ -70,16 +69,13 @@ def kernel_shared( # Fuse with SiLU and element-wise product for i, j in T.Parallel(block_token, block_dexpert): - gate_logits_local[i, j] = gate_logits_local[i, j] * ( - 1.0 / (1.0 + T.exp2(-gate_logits_local[i, j] * scale))) + gate_logits_local[i, j] = gate_logits_local[i, j] * (1.0 / (1.0 + T.exp2(-gate_logits_local[i, j] * scale))) up_logits_local[i, j] = up_logits_local[i, j] * gate_logits_local[i, j] T.copy(up_logits_local, up_logits[bx * block_token, by * block_dexpert]) # Step 2: Compute down logits - with T.Kernel( - T.ceildiv(num_tokens, block_token), T.ceildiv(dhidden, block_dhidden), - threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(num_tokens, block_token), T.ceildiv(dhidden, block_dhidden), threads=threads) as (bx, by): up_logits_shared = T.alloc_fragment((block_token, block_dexpert), dtype=dtype) W_down_shared = T.alloc_shared((block_dhidden, block_dexpert), dtype=dtype) output_local = T.alloc_fragment((block_token, block_dhidden), dtype=accum_type) @@ -97,21 +93,25 @@ def kernel_shared( return kernel_shared -@tilelang.jit(pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) -def moe_forward_tilelang_routed(d_hidden, - d_expert, - n_routed_experts, - dtype, - group_sum, - group_count, - block_token=128, - block_dhidden=128, - block_dexpert=128, - threads=256, - num_stages=1, - k_pack=1, - coalesced_width=None): - +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + } +) +def moe_forward_tilelang_routed( + d_hidden, + d_expert, + n_routed_experts, + dtype, + group_sum, + group_count, + block_token=128, + block_dhidden=128, + block_dexpert=128, + threads=256, + num_stages=1, +): scale = 1.44269504 # log2(e) # Parameters @@ -124,7 +124,7 @@ def moe_forward_tilelang_routed(d_hidden, # group_count = len(group_sizes_list) # M = sum([(group_size + block_token - 1) // block_token for group_size in group_sizes_list]) M = math.ceil(group_sum / block_token) + group_count - accum_dtype = "float32" + accum_dtype = T.float32 # Tensors: Note that input shape is reshape to (bs * seq_len * n_experts_per_token, dhidden) for grouped gemm input_shape = (group_sum, dhidden) @@ -132,22 +132,22 @@ def moe_forward_tilelang_routed(d_hidden, routed_expert_gate_shape = (n_routed_experts, dexpert, dhidden) routed_expert_up_shape = (n_routed_experts, dexpert, dhidden) routed_expert_down_shape = (n_routed_experts, dhidden, dexpert) - routed_expert_weights_shape = (group_sum) - group_sizes_shape = (n_routed_experts) + routed_expert_weights_shape = group_sum + group_sizes_shape = n_routed_experts @T.prim_func def kernel( - input: T.Tensor(input_shape, dtype), # type: ignore - routed_expert_gate: T.Tensor(routed_expert_gate_shape, dtype), # type: ignore - routed_expert_up: T.Tensor(routed_expert_up_shape, dtype), # type: ignore - routed_expert_down: T.Tensor(routed_expert_down_shape, dtype), # type: ignore - routed_expert_weights: T.Tensor(routed_expert_weights_shape, dtype), # type: ignore - group_sizes: T.Tensor(group_sizes_shape, "int32"), # type: ignore - group_offsets: T.Tensor(group_sizes_shape, "int32"), # type: ignore - 
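The fused activation computes SiLU as x * sigmoid(x), with the sigmoid expressed through exp2 and the log2(e) factor so it lowers to the fast exp2 path. A quick check against torch.nn.functional.silu:

import torch
import torch.nn.functional as F

scale = 1.44269504  # log2(e)
x = torch.randn(4096)
silu_exp2 = x * (1.0 / (1.0 + torch.exp2(-x * scale)))
assert torch.allclose(silu_exp2, F.silu(x), atol=1e-5)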
group_padded_offsets: T.Tensor(group_sizes_shape, "int32"), # type: ignore - group_idx_for_bx: T.Tensor((M,), "int32"), # type: ignore - up_logits: T.Tensor(intermediate_shape, dtype), # type: ignore - output: T.Tensor(input_shape, dtype), # type: ignore + input: T.Tensor(input_shape, dtype), # type: ignore + routed_expert_gate: T.Tensor(routed_expert_gate_shape, dtype), # type: ignore + routed_expert_up: T.Tensor(routed_expert_up_shape, dtype), # type: ignore + routed_expert_down: T.Tensor(routed_expert_down_shape, dtype), # type: ignore + routed_expert_weights: T.Tensor(routed_expert_weights_shape, dtype), # type: ignore + group_sizes: T.Tensor(group_sizes_shape, T.int32), # type: ignore + group_offsets: T.Tensor(group_sizes_shape, T.int32), # type: ignore + group_padded_offsets: T.Tensor(group_sizes_shape, T.int32), # type: ignore + group_idx_for_bx: T.Tensor((M,), T.int32), # type: ignore + up_logits: T.Tensor(intermediate_shape, dtype), # type: ignore + output: T.Tensor(input_shape, dtype), # type: ignore ): # Step 1: Compute gate and up logits with T.Kernel(M, T.ceildiv(dexpert, block_dexpert), threads=threads) as (bx, by): @@ -158,58 +158,41 @@ def kernel( gate_logits_local = T.alloc_fragment((block_token, block_dexpert), dtype=accum_dtype) up_logits_local = T.alloc_fragment((block_token, block_dexpert), dtype=accum_dtype) - cur_group_idx = T.alloc_local([1], "int32") - cur_group_size = T.alloc_local([1], "int32") - - T.use_swizzle(10, enable=True) + T.use_swizzle(10) m_start_padded = bx * block_token - cur_group_idx[0] = group_idx_for_bx[bx] + cur_group_idx = group_idx_for_bx[bx] - cur_group_size[0] = group_sizes[cur_group_idx[0]] - m_start = m_start_padded - group_padded_offsets[cur_group_idx[0]] + group_offsets[ - cur_group_idx[0]] - actual_rows = T.max( - 0, - T.min(block_token, cur_group_size[0] - - (m_start_padded - group_padded_offsets[cur_group_idx[0]]))) + cur_group_size = group_sizes[cur_group_idx] + m_start = m_start_padded - group_padded_offsets[cur_group_idx] + group_offsets[cur_group_idx] + actual_rows = T.max(0, T.min(block_token, cur_group_size - (m_start_padded - group_padded_offsets[cur_group_idx]))) T.clear(gate_logits_local) T.clear(up_logits_local) for k in T.Pipelined(T.ceildiv(dhidden, block_dhidden), num_stages=num_stages): T.copy( - input[m_start:m_start + block_token, k * block_dhidden:(k + 1) * block_dhidden], + input[m_start : m_start + block_token, k * block_dhidden : (k + 1) * block_dhidden], input_shared, - coalesced_width=coalesced_width) + ) T.copy( - routed_expert_gate[cur_group_idx[0], - by * block_dexpert:(by + 1) * block_dexpert, - k * block_dhidden:(k + 1) * block_dhidden], - routed_expert_gate_shared, - coalesced_width=coalesced_width) - T.gemm( - input_shared, + routed_expert_gate[ + cur_group_idx, by * block_dexpert : (by + 1) * block_dexpert, k * block_dhidden : (k + 1) * block_dhidden + ], routed_expert_gate_shared, - gate_logits_local, - k_pack=k_pack, - transpose_B=True) + ) + T.gemm(input_shared, routed_expert_gate_shared, gate_logits_local, transpose_B=True) T.copy( - routed_expert_up[cur_group_idx[0], by * block_dexpert:(by + 1) * block_dexpert, - k * block_dhidden:(k + 1) * block_dhidden], + routed_expert_up[ + cur_group_idx, by * block_dexpert : (by + 1) * block_dexpert, k * block_dhidden : (k + 1) * block_dhidden + ], routed_expert_up_shared, - coalesced_width=coalesced_width) - T.gemm( - input_shared, - routed_expert_up_shared, - up_logits_local, - k_pack=k_pack, - transpose_B=True) + ) + T.gemm(input_shared, 
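The routed-expert kernel walks a padded token space in which every expert group starts on a `block_token` boundary; `group_idx_for_bx` maps each block to its group, and `m_start` / `actual_rows` translate back into the packed token layout. A host-side sketch of that bookkeeping (group sizes illustrative; this mirrors what the kernel's index math expects and is not code from this PR):

import math

block_token = 128
group_sizes = [300, 5, 0, 250]                      # tokens routed to each expert

group_offsets = [0]
for n in group_sizes[:-1]:
    group_offsets.append(group_offsets[-1] + n)

padded = [math.ceil(n / block_token) * block_token for n in group_sizes]
group_padded_offsets = [0]
for n in padded[:-1]:
    group_padded_offsets.append(group_padded_offsets[-1] + n)

group_idx_for_bx = []
for g, n in enumerate(padded):
    group_idx_for_bx += [g] * (n // block_token)

covered = 0
for bx, g in enumerate(group_idx_for_bx):
    m_start_padded = bx * block_token
    m_start = m_start_padded - group_padded_offsets[g] + group_offsets[g]
    actual_rows = max(0, min(block_token, group_sizes[g] - (m_start_padded - group_padded_offsets[g])))
    covered += actual_rows

assert covered == sum(group_sizes)
# M = ceil(group_sum / block_token) + group_count is an upper bound on the block count
assert len(group_idx_for_bx) <= math.ceil(sum(group_sizes) / block_token) + len(group_sizes)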
routed_expert_up_shared, up_logits_local, transpose_B=True) for i, j in T.Parallel(block_token, block_dexpert): - gate_logits_local[i, j] = gate_logits_local[i, j] * ( - 1.0 / (1.0 + T.exp2(-gate_logits_local[i, j] * scale))) + gate_logits_local[i, j] = gate_logits_local[i, j] * (1.0 / (1.0 + T.exp2(-gate_logits_local[i, j] * scale))) up_logits_local[i, j] = up_logits_local[i, j] * gate_logits_local[i, j] for i, j in T.Parallel(block_token, block_dexpert): @@ -222,60 +205,40 @@ def kernel( routed_expert_down_shared = T.alloc_shared((block_dhidden, block_dexpert), dtype=dtype) output_local = T.alloc_fragment((block_token, block_dhidden), dtype=accum_dtype) - cur_group_idx = T.alloc_local([1], "int32") - cur_group_size = T.alloc_local([1], "int32") - - T.use_swizzle(10, enable=True) + T.use_swizzle(10) m_start_padded = bx * block_token - cur_group_idx[0] = group_idx_for_bx[bx] + cur_group_idx = group_idx_for_bx[bx] - cur_group_size[0] = group_sizes[cur_group_idx[0]] - m_start = m_start_padded - group_padded_offsets[cur_group_idx[0]] + group_offsets[ - cur_group_idx[0]] - actual_rows = T.max( - 0, - T.min(block_token, cur_group_size[0] - - (m_start_padded - group_padded_offsets[cur_group_idx[0]]))) + cur_group_size = group_sizes[cur_group_idx] + m_start = m_start_padded - group_padded_offsets[cur_group_idx] + group_offsets[cur_group_idx] + actual_rows = T.max(0, T.min(block_token, cur_group_size - (m_start_padded - group_padded_offsets[cur_group_idx]))) T.clear(output_local) for k in T.Pipelined(T.ceildiv(dexpert, block_dexpert), num_stages=num_stages): T.copy( - up_logits[m_start:m_start + block_token, - k * block_dexpert:(k + 1) * block_dexpert], + up_logits[m_start : m_start + block_token, k * block_dexpert : (k + 1) * block_dexpert], up_logits_shared, - coalesced_width=coalesced_width) + ) T.copy( - routed_expert_down[cur_group_idx[0], - by * block_dhidden:(by + 1) * block_dhidden, - k * block_dexpert:(k + 1) * block_dexpert], - routed_expert_down_shared, - coalesced_width=coalesced_width) - T.gemm( - up_logits_shared, + routed_expert_down[ + cur_group_idx, by * block_dhidden : (by + 1) * block_dhidden, k * block_dexpert : (k + 1) * block_dexpert + ], routed_expert_down_shared, - output_local, - k_pack=k_pack, - transpose_B=True) + ) + T.gemm(up_logits_shared, routed_expert_down_shared, output_local, transpose_B=True) for i, j in T.Parallel(block_token, block_dhidden): if i < actual_rows: - output[m_start + i, by * block_dhidden + - j] = output_local[i, j] * routed_expert_weights[m_start + i] + output[m_start + i, by * block_dhidden + j] = output_local[i, j] * routed_expert_weights[m_start + i] return kernel class Expert(nn.Module): - - def __init__(self, - config: Dict, - gate: torch.Tensor, - up: torch.Tensor, - down: torch.Tensor, - d_expert: Optional[int] = None): + def __init__(self, config: Dict, gate: torch.Tensor, up: torch.Tensor, down: torch.Tensor, d_expert: Optional[int] = None): super().__init__() self.config = config self.act_fn = nn.SiLU() @@ -294,14 +257,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class MoEGate(nn.Module): - def __init__(self, config: Dict, weights: Dict): super().__init__() self.top_k: int = config["n_experts_per_token"] self.num_experts: int = config["n_routed_experts"] self.d_hidden: int = config["d_hidden"] - self.W_g_weight = weights['router.weight'].t() + self.W_g_weight = weights["router.weight"].t() def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: logits = x @ self.W_g_weight @@ -312,76 +274,69 @@ def 
forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: class MoE(nn.Module): - - def __init__(self, - config: Dict, - shared_kernel: tilelang.JITKernel, - routed_kernel: tilelang.JITKernel, - weights: Dict, - padding_M: int = 128): + def __init__( + self, config: Dict, shared_kernel: tilelang.JITKernel, routed_kernel: tilelang.JITKernel, weights: Dict, padding_M: int = 128 + ): super().__init__() self.config = config self.shared_kernel = shared_kernel self.routed_kernel = routed_kernel self.padding_M = padding_M - self.experts = nn.ModuleList([ - Expert( - config, - gate=weights[f'experts.{i}.0.weight'], - up=weights[f'experts.{i}.1.weight'], - down=weights[f'experts.{i}.2.weight']) for i in range(config["n_routed_experts"]) - ]) + self.experts = nn.ModuleList( + [ + Expert( + config, + gate=weights[f"experts.{i}.0.weight"], + up=weights[f"experts.{i}.1.weight"], + down=weights[f"experts.{i}.2.weight"], + ) + for i in range(config["n_routed_experts"]) + ] + ) self.device = torch.device("cuda") self.gating_network = MoEGate(config, weights).to(self.device) shared_expert_dim = config["d_expert"] * config["n_shared_experts"] self.shared_expert = Expert( config=config, - gate=weights['shared_experts.0.weight'], - up=weights['shared_experts.1.weight'], - down=weights['shared_experts.2.weight'], - d_expert=shared_expert_dim).to(self.device) + gate=weights["shared_experts.0.weight"], + up=weights["shared_experts.1.weight"], + down=weights["shared_experts.2.weight"], + d_expert=shared_expert_dim, + ).to(self.device) self.expert_cache = torch.zeros( - (config["batch_size"] * config["seq_len"], config["d_hidden"]), - dtype=torch.float16, - device=self.device) - self.stacked_expert_w_gate = torch.stack([expert.W_gate_weight for expert in self.experts], - dim=0) - self.stacked_expert_w_up = torch.stack([expert.W_up_weight for expert in self.experts], - dim=0) - self.stacked_expert_w_down = torch.stack([expert.W_down_weight for expert in self.experts], - dim=0) + (config["batch_size"] * config["seq_len"], config["d_hidden"]), dtype=torch.float16, device=self.device + ) + self.stacked_expert_w_gate = torch.stack([expert.W_gate_weight for expert in self.experts], dim=0) + self.stacked_expert_w_up = torch.stack([expert.W_up_weight for expert in self.experts], dim=0) + self.stacked_expert_w_down = torch.stack([expert.W_down_weight for expert in self.experts], dim=0) self.stacked_expert_tokens = torch.empty( - (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], - self.config["d_hidden"]), + (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], self.config["d_hidden"]), dtype=torch.float16, - device=self.device) + device=self.device, + ) self.stacked_expert_weights = torch.empty( - (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"]), - dtype=torch.float16, - device=self.device) + (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"]), dtype=torch.float16, device=self.device + ) self.stacked_expert_tokens_idxs = torch.empty( - (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"]), - dtype=torch.int64, - device=self.device) + (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"]), dtype=torch.int64, device=self.device + ) self.up_logits_shared = torch.empty( - (config["batch_size"] * config["seq_len"], self.config["d_expert"]), - dtype=torch.float16, - device=self.device) + (config["batch_size"] * config["seq_len"], self.config["d_expert"]), dtype=torch.float16, 
device=self.device + ) self.expert_output_shared = torch.empty( - (config["batch_size"] * config["seq_len"], self.config["d_hidden"]), - dtype=torch.float16, - device=self.device) + (config["batch_size"] * config["seq_len"], self.config["d_hidden"]), dtype=torch.float16, device=self.device + ) self.up_logits_routed = torch.empty( - (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], - self.config["d_expert"]), + (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], self.config["d_expert"]), dtype=torch.float16, - device=self.device) + device=self.device, + ) self.expert_output_routed = torch.empty( - (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], - self.config["d_hidden"]), + (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], self.config["d_hidden"]), dtype=torch.float16, - device=self.device) + device=self.device, + ) @torch.no_grad() def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -413,22 +368,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: self.stacked_expert_tokens[start_idx:end_idx] = expert_tokens self.stacked_expert_tokens_idxs[start_idx:end_idx] = exp_token_idxs - self.stacked_expert_weights[start_idx:end_idx] = flat_expert_weights[ - idxs[start_idx:end_idx]] + self.stacked_expert_weights[start_idx:end_idx] = flat_expert_weights[idxs[start_idx:end_idx]] group_sizes = torch.tensor(counts, dtype=torch.int32, device=self.device) - group_offset = torch.tensor( - tokens_per_expert - counts, dtype=torch.int32, device=self.device) + group_offset = torch.tensor(tokens_per_expert - counts, dtype=torch.int32, device=self.device) group_padded_offsets = [0 for _ in range(len(group_sizes))] for i in range(1, len(group_sizes)): - group_padded_offsets[i] = group_padded_offsets[i - 1] + math.ceil( - (counts[i - 1] + 1) / self.padding_M) * self.padding_M + group_padded_offsets[i] = group_padded_offsets[i - 1] + math.ceil((counts[i - 1] + 1) / self.padding_M) * self.padding_M block_token = 128 - M = math.ceil( - self.config["batch_size"] * self.config["seq_len"] * - self.config["n_experts_per_token"] / block_token) + self.config["n_routed_experts"] + M = ( + math.ceil(self.config["batch_size"] * self.config["seq_len"] * self.config["n_experts_per_token"] / block_token) + + self.config["n_routed_experts"] + ) group_idx_for_bx = [0 for _ in range(M)] for bx in range(M): @@ -437,8 +390,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if m_start_padded >= group_padded_offsets[i]: group_idx_for_bx[bx] = i - group_padded_offsets = torch.tensor( - group_padded_offsets, dtype=torch.int32, device=self.device) + group_padded_offsets = torch.tensor(group_padded_offsets, dtype=torch.int32, device=self.device) group_idx_for_bx = torch.tensor(group_idx_for_bx, dtype=torch.int32, device=self.device) # Multi-stream execution @@ -448,11 +400,19 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: with torch.cuda.stream(routed_stream): # Tilelang version: Grouped GEMM - self.routed_kernel(self.stacked_expert_tokens, self.stacked_expert_w_gate, - self.stacked_expert_w_up, self.stacked_expert_w_down, - self.stacked_expert_weights, group_sizes, group_offset, - group_padded_offsets, group_idx_for_bx, self.up_logits_routed, - self.expert_output_routed) + self.routed_kernel( + self.stacked_expert_tokens, + self.stacked_expert_w_gate, + self.stacked_expert_w_up, + self.stacked_expert_w_down, + self.stacked_expert_weights, + group_sizes, + group_offset, + group_padded_offsets, + group_idx_for_bx, + 
self.up_logits_routed, + self.expert_output_routed, + ) # Scatter reduce self.expert_cache = torch.scatter_reduce( @@ -460,14 +420,19 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: 0, self.stacked_expert_tokens_idxs.view(-1, 1).repeat(1, x_flat.shape[-1]), self.expert_output_routed, - reduce='sum') + reduce="sum", + ) routed_output = self.expert_cache.view(*orig_shape) with torch.cuda.stream(shared_stream): - - self.shared_kernel(x_flat, self.shared_expert.W_gate_weight, - self.shared_expert.W_up_weight, self.shared_expert.W_down_weight, - self.up_logits_shared, self.expert_output_shared) + self.shared_kernel( + x_flat, + self.shared_expert.W_gate_weight, + self.shared_expert.W_up_weight, + self.shared_expert.W_down_weight, + self.up_logits_shared, + self.expert_output_shared, + ) shared_output = self.expert_output_shared.view(*orig_shape) torch.cuda.synchronize() @@ -491,14 +456,15 @@ def custom_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: """ input_tensor, weights, config = data - dtype_str = "float16" + dtype_str = T.float16 shared_kernel = moe_forward_tilelang_shared( config["d_hidden"], config["d_expert"], config["n_shared_experts"], dtype=dtype_str, - num_tokens=config["batch_size"] * config["seq_len"]) + num_tokens=config["batch_size"] * config["seq_len"], + ) routed_kernel = moe_forward_tilelang_routed( config["d_hidden"], config["d_expert"], @@ -511,8 +477,7 @@ def custom_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: block_dexpert=128, threads=256, num_stages=1, - k_pack=1, - coalesced_width=2) + ) moe = MoE(config, shared_kernel, routed_kernel, weights, padding_M=128) @@ -521,13 +486,7 @@ def custom_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: return output -def main(d_hidden=7168, - d_expert=2048, - n_routed_experts=8, - n_shared_experts=1, - n_experts_per_token=4, - batch_size=1, - seq_len=8192): +def main(d_hidden=7168, d_expert=2048, n_routed_experts=8, n_shared_experts=1, n_experts_per_token=4, batch_size=1, seq_len=8192): config = { "dhidden": d_hidden, "dexpert": d_expert, @@ -536,20 +495,131 @@ def main(d_hidden=7168, "nexpertspertoken": n_experts_per_token, "bs": batch_size, "seqlen": seq_len, - "seed": 81394 + "seed": 81394, } data = generate_input(**config) - - torch.cuda.synchronize() ref_output = ref_kernel(clone_data(data)).to(torch.float32) - torch.cuda.synchronize() tilelang_output = custom_kernel(clone_data(data)).to(torch.float32) - torch.cuda.synchronize() - torch.testing.assert_close(ref_output, tilelang_output, atol=1e-2, rtol=1e-2) print("✅ Tilelang and Torch match") +def run_regression_perf( + d_hidden=7168, d_expert=2048, n_routed_experts=8, n_shared_experts=1, n_experts_per_token=4, batch_size=1, seq_len=8192 +): + config = { + "dhidden": d_hidden, + "dexpert": d_expert, + "nroutedexperts": n_routed_experts, + "nsharedexperts": n_shared_experts, + "nexpertspertoken": n_experts_per_token, + "bs": batch_size, + "seqlen": seq_len, + "seed": 81394, + } + from tilelang.profiler import do_bench + + data = generate_input(**config) + + x, weights, config = data + + dtype_str = "float16" + + shared_kernel = moe_forward_tilelang_shared( + config["d_hidden"], + config["d_expert"], + config["n_shared_experts"], + dtype=dtype_str, + num_tokens=config["batch_size"] * config["seq_len"], + ) + routed_kernel = moe_forward_tilelang_routed( + config["d_hidden"], + config["d_expert"], + config["n_routed_experts"], + dtype=dtype_str, + group_sum=config["batch_size"] * config["seq_len"] * 
config["n_experts_per_token"], + group_count=config["n_routed_experts"], + block_token=128, + block_dhidden=128, + block_dexpert=128, + threads=256, + num_stages=1, + coalesced_width=2, + ) + + moe = MoE(config, shared_kernel, routed_kernel, weights, padding_M=128) + batch_size, seq_len, hidden_dim = x.shape + expert_indices, expert_scores = moe.gating_network(x) + flat_expert_indices = expert_indices.view(-1) + flat_expert_weights = expert_scores.view(-1) + x_flat = x.view(-1, hidden_dim) + idxs = flat_expert_indices.argsort() + counts = flat_expert_indices.bincount().cpu().numpy() + tokens_per_expert = counts.cumsum() + num_per_tok = moe.config["n_experts_per_token"] + token_idxs = idxs // num_per_tok + for expert_id, end_idx in enumerate(tokens_per_expert): + start_idx = 0 if expert_id == 0 else tokens_per_expert[expert_id - 1] + if start_idx == end_idx: + continue + exp_token_idxs = token_idxs[start_idx:end_idx] + expert_tokens = x_flat[exp_token_idxs] + moe.stacked_expert_tokens[start_idx:end_idx] = expert_tokens + moe.stacked_expert_tokens_idxs[start_idx:end_idx] = exp_token_idxs + moe.stacked_expert_weights[start_idx:end_idx] = flat_expert_weights[idxs[start_idx:end_idx]] + group_sizes = torch.tensor(counts, dtype=torch.int32, device=moe.device) + group_offset = torch.tensor(tokens_per_expert - counts, dtype=torch.int32, device=moe.device) + group_padded_offsets = [0 for _ in range(len(group_sizes))] + for i in range(1, len(group_sizes)): + group_padded_offsets[i] = group_padded_offsets[i - 1] + math.ceil((counts[i - 1] + 1) / moe.padding_M) * moe.padding_M + block_token = 128 + M = ( + math.ceil(moe.config["batch_size"] * moe.config["seq_len"] * moe.config["n_experts_per_token"] / block_token) + + moe.config["n_routed_experts"] + ) + group_idx_for_bx = [0 for _ in range(M)] + for bx in range(M): + m_start_padded = bx * block_token + for i in range(moe.config["n_routed_experts"]): + if m_start_padded >= group_padded_offsets[i]: + group_idx_for_bx[bx] = i + group_padded_offsets = torch.tensor(group_padded_offsets, dtype=torch.int32, device=moe.device) + group_idx_for_bx = torch.tensor(group_idx_for_bx, dtype=torch.int32, device=moe.device) + + def run_shared_kernel_only(): + moe.routed_kernel( + moe.stacked_expert_tokens, + moe.stacked_expert_w_gate, + moe.stacked_expert_w_up, + moe.stacked_expert_w_down, + moe.stacked_expert_weights, + group_sizes, + group_offset, + group_padded_offsets, + group_idx_for_bx, + moe.up_logits_routed, + moe.expert_output_routed, + ) + + def run_routed_kernel_only(): + moe.routed_kernel( + moe.stacked_expert_tokens, + moe.stacked_expert_w_gate, + moe.stacked_expert_w_up, + moe.stacked_expert_w_down, + moe.stacked_expert_weights, + group_sizes, + group_offset, + group_padded_offsets, + group_idx_for_bx, + moe.up_logits_routed, + moe.expert_output_routed, + ) + + return do_bench(run_routed_kernel_only, backend="cupti") + + if __name__ == "__main__": + tilelang.disable_cache() main() diff --git a/examples/fusedmoe/example_fusedmoe_torch.py b/examples/fusedmoe/example_fusedmoe_torch.py index 00219c6e9..6b6322aff 100644 --- a/examples/fusedmoe/example_fusedmoe_torch.py +++ b/examples/fusedmoe/example_fusedmoe_torch.py @@ -6,7 +6,6 @@ # Reference code in PyTorch class ExpertTorch(nn.Module): - def __init__(self, config: Dict, d_expert: Optional[int] = None): super().__init__() self.config = config @@ -25,7 +24,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class MoEGateTorch(nn.Module): - def __init__(self, config: Dict): super().__init__() 
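As an aside, the group bookkeeping that `MoE.forward` and `run_regression_perf` set up for the routed grouped-GEMM kernel is easier to follow in isolation. A standalone sketch with made-up token counts, mirroring the arithmetic above (including the `counts[i - 1] + 1` padding term):

import math

block_token = 128
counts = [300, 0, 130, 50]  # hypothetical tokens routed to each of 4 experts

# Padded start offset of each expert's block of rows, as computed above.
group_padded_offsets = [0] * len(counts)
for i in range(1, len(counts)):
    group_padded_offsets[i] = group_padded_offsets[i - 1] + math.ceil(
        (counts[i - 1] + 1) / block_token) * block_token

# Map every padded block index bx back to the expert (group) that owns it.
M = math.ceil(sum(counts) / block_token) + len(counts)
group_idx_for_bx = [0] * M
for bx in range(M):
    m_start_padded = bx * block_token
    for i in range(len(counts)):
        if m_start_padded >= group_padded_offsets[i]:
            group_idx_for_bx[bx] = i

print(group_padded_offsets)  # [0, 384, 512, 768]
print(group_idx_for_bx)      # [0, 0, 0, 1, 2, 2, 3, 3]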
self.top_k: int = config["n_experts_per_token"] @@ -43,12 +41,10 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: class MoETorch(nn.Module): - def __init__(self, config: Dict): super().__init__() self.config = config - self.experts = nn.ModuleList( - [ExpertTorch(config) for _ in range(config["n_routed_experts"])]) + self.experts = nn.ModuleList([ExpertTorch(config) for _ in range(config["n_routed_experts"])]) self.gating_network = MoEGateTorch(config) shared_expert_dim = config["d_expert"] * config["n_shared_experts"] self.shared_expert = ExpertTorch(config=config, d_expert=shared_expert_dim) @@ -67,8 +63,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return routed_output + shared_output @torch.no_grad() - def moe_infer(self, x: torch.Tensor, flat_expert_indices: torch.Tensor, - flat_expert_weights: torch.Tensor) -> torch.Tensor: + def moe_infer(self, x: torch.Tensor, flat_expert_indices: torch.Tensor, flat_expert_weights: torch.Tensor) -> torch.Tensor: expert_cache = torch.zeros_like(x) # test_expert_cache = torch.zeros((x.shape[0] * self.config["n_experts_per_token"], self.config["d_hidden"])) # test_expert_tokens = torch.zeros((x.shape[0] * self.config["n_experts_per_token"], self.config["d_hidden"])) @@ -91,8 +86,7 @@ def moe_infer(self, x: torch.Tensor, flat_expert_indices: torch.Tensor, expert_out = expert(expert_tokens) expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]]) - expert_cache.scatter_reduce_( - 0, exp_token_idxs.view(-1, 1).repeat(1, x.shape[-1]), expert_out, reduce='sum') + expert_cache.scatter_reduce_(0, exp_token_idxs.view(-1, 1).repeat(1, x.shape[-1]), expert_out, reduce="sum") return expert_cache @@ -116,21 +110,21 @@ def ref_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: moe = MoETorch(config) # Fill in the given weights of the model - moe.gating_network.W_g.weight = nn.Parameter(weights['router.weight']) + moe.gating_network.W_g.weight = nn.Parameter(weights["router.weight"]) for i in range(num_experts): - gate_proj_weight = weights[f'experts.{i}.0.weight'] - up_proj_weight = weights[f'experts.{i}.1.weight'] - down_proj_weight = weights[f'experts.{i}.2.weight'] + gate_proj_weight = weights[f"experts.{i}.0.weight"] + up_proj_weight = weights[f"experts.{i}.1.weight"] + down_proj_weight = weights[f"experts.{i}.2.weight"] # Transpose weights to match expected shape for nn.Linear moe.experts[i].W_gate.weight = nn.Parameter(gate_proj_weight.t()) moe.experts[i].W_up.weight = nn.Parameter(up_proj_weight.t()) moe.experts[i].W_down.weight = nn.Parameter(down_proj_weight.t()) - moe.shared_expert.W_gate.weight = nn.Parameter(weights['shared_experts.0.weight'].t()) - moe.shared_expert.W_up.weight = nn.Parameter(weights['shared_experts.1.weight'].t()) - moe.shared_expert.W_down.weight = nn.Parameter(weights['shared_experts.2.weight'].t()) + moe.shared_expert.W_gate.weight = nn.Parameter(weights["shared_experts.0.weight"].t()) + moe.shared_expert.W_up.weight = nn.Parameter(weights["shared_experts.1.weight"].t()) + moe.shared_expert.W_down.weight = nn.Parameter(weights["shared_experts.2.weight"].t()) output = moe(input_tensor) @@ -140,10 +134,9 @@ def ref_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: # Input generation for the reference code -def generate_input(dhidden: int, dexpert: int, nroutedexperts: int, nsharedexperts: int, - nexpertspertoken: int, bs: int, seqlen: int, - seed: int) -> Tuple[torch.Tensor, Dict, Dict]: - +def generate_input( + dhidden: int, dexpert: int, nroutedexperts: int, 
nsharedexperts: int, nexpertspertoken: int, bs: int, seqlen: int, seed: int +) -> Tuple[torch.Tensor, Dict, Dict]: # Really dumb but for now _ isn't parsing correctly. d_hidden = dhidden d_expert = dexpert @@ -163,50 +156,40 @@ def generate_input(dhidden: int, dexpert: int, nroutedexperts: int, nsharedexper "seq_len": seq_len, } - gen = torch.Generator(device='cuda') + gen = torch.Generator(device="cuda") gen.manual_seed(seed) num_experts = n_routed_experts expert_dim = d_expert weights = {} - input_tensor = torch.randn((batch_size, seq_len, d_hidden), - device='cuda', - dtype=torch.float16, - generator=gen).contiguous() + input_tensor = torch.randn((batch_size, seq_len, d_hidden), device="cuda", dtype=torch.float16, generator=gen).contiguous() # Initialize router weights - weights['router.weight'] = torch.randn( - (num_experts, d_hidden), device="cuda", dtype=torch.float16, - generator=gen) / math.sqrt(d_hidden) + weights["router.weight"] = torch.randn((num_experts, d_hidden), device="cuda", dtype=torch.float16, generator=gen) / math.sqrt(d_hidden) for i in range(num_experts): - weights[f'experts.{i}.0.weight'] = torch.randn( - (d_hidden, expert_dim), device='cuda', dtype=torch.float16, - generator=gen) / math.sqrt(expert_dim) - - weights[f'experts.{i}.1.weight'] = torch.randn( - (d_hidden, expert_dim), device='cuda', dtype=torch.float16, - generator=gen) / math.sqrt(expert_dim) - - weights[f'experts.{i}.2.weight'] = torch.randn( - (expert_dim, d_hidden), device='cuda', dtype=torch.float16, - generator=gen) / math.sqrt(d_hidden) - - weights['shared_experts.0.weight'] = torch.randn( - (d_hidden, expert_dim * n_shared_experts), - device='cuda', - dtype=torch.float16, - generator=gen) / math.sqrt(expert_dim * n_shared_experts) - weights['shared_experts.1.weight'] = torch.randn( - (d_hidden, expert_dim * n_shared_experts), - device='cuda', - dtype=torch.float16, - generator=gen) / math.sqrt(expert_dim * n_shared_experts) - weights['shared_experts.2.weight'] = torch.randn((expert_dim * n_shared_experts, d_hidden), - device='cuda', - dtype=torch.float16, - generator=gen) / math.sqrt(d_hidden) + weights[f"experts.{i}.0.weight"] = torch.randn( + (d_hidden, expert_dim), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(expert_dim) + + weights[f"experts.{i}.1.weight"] = torch.randn( + (d_hidden, expert_dim), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(expert_dim) + + weights[f"experts.{i}.2.weight"] = torch.randn( + (expert_dim, d_hidden), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(d_hidden) + + weights["shared_experts.0.weight"] = torch.randn( + (d_hidden, expert_dim * n_shared_experts), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(expert_dim * n_shared_experts) + weights["shared_experts.1.weight"] = torch.randn( + (d_hidden, expert_dim * n_shared_experts), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(expert_dim * n_shared_experts) + weights["shared_experts.2.weight"] = torch.randn( + (expert_dim * n_shared_experts, d_hidden), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(d_hidden) return (input_tensor, weights, config) diff --git a/examples/fusedmoe/regression_example_fusedmoe.py b/examples/fusedmoe/regression_example_fusedmoe.py new file mode 100644 index 000000000..ac0f18aae --- /dev/null +++ b/examples/fusedmoe/regression_example_fusedmoe.py @@ -0,0 +1,19 @@ +import tilelang.testing +import example_fusedmoe_tilelang + + +def regression_example_fusedmoe_tilelang(): + 
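One more illustrative aside: the per-expert outputs are written back to token order with `scatter_reduce(..., reduce="sum")`, as in `MoE.forward` and `moe_infer` above. A toy version of that combine step, with hypothetical sizes and a hand-written index mapping:

import torch

n_tokens, d_hidden, top_k = 4, 3, 2
expert_cache = torch.zeros(n_tokens, d_hidden)

# One row of output per (token, expert) pair, already weighted by the router score.
expert_out = torch.randn(n_tokens * top_k, d_hidden)
# Token index that each expert-output row belongs to (argsorted order in the real code).
token_idxs = torch.tensor([0, 0, 1, 2, 2, 3, 1, 3])

combined = torch.scatter_reduce(
    expert_cache,
    0,
    token_idxs.view(-1, 1).repeat(1, d_hidden),
    expert_out,
    reduce="sum",
)
# Each token row of `combined` now holds the sum of its top_k expert contributions.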
tilelang.testing.process_func( + example_fusedmoe_tilelang.run_regression_perf, + d_hidden=1024, + d_expert=256, + n_routed_experts=8, + n_shared_experts=1, + n_experts_per_token=4, + batch_size=1, + seq_len=1024, + ) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/fusedmoe/test_example_fusedmoe.py b/examples/fusedmoe/test_example_fusedmoe.py index 806aff49e..ba8415895 100644 --- a/examples/fusedmoe/test_example_fusedmoe.py +++ b/examples/fusedmoe/test_example_fusedmoe.py @@ -4,13 +4,8 @@ def test_example_fusedmoe_tilelang(): example_fusedmoe_tilelang.main( - d_hidden=1024, - d_expert=256, - n_routed_experts=8, - n_shared_experts=1, - n_experts_per_token=4, - batch_size=1, - seq_len=1024) + d_hidden=1024, d_expert=256, n_routed_experts=8, n_shared_experts=1, n_experts_per_token=4, batch_size=1, seq_len=1024 + ) if __name__ == "__main__": diff --git a/examples/gdn/example_chunk_delta_bwd.py b/examples/gdn/example_chunk_delta_bwd.py index 518b0ee21..466c47182 100644 --- a/examples/gdn/example_chunk_delta_bwd.py +++ b/examples/gdn/example_chunk_delta_bwd.py @@ -4,6 +4,7 @@ import tilelang import tilelang.language as T +from tilelang.profiler import do_bench print(tilelang.__file__, flush=True) @@ -12,6 +13,7 @@ # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__, flush=True) from fla.ops.common.chunk_delta_h import chunk_gated_delta_rule_bwd_dhu except ImportError: @@ -24,7 +26,7 @@ torch.random.manual_seed(0) # torch.set_printoptions(profile="full") -from utils import * +from test_utils import assert_similar def prepare_input( @@ -49,6 +51,7 @@ def prepare_input( G = F.logsigmoid(G) try: from fla.ops.utils.cumsum import chunk_local_cumsum + G = chunk_local_cumsum(G, chunk_size) except ImportError: print("fla not found, skip cumsum") @@ -125,8 +128,11 @@ def torch_chunk_gated_delta_rule_bwd_dhu( DV = dv.shape[-1] block_S = 64 BS = S // block_S - dh, dh0, dv2 = torch.empty((B, BS, H, DK, DV), dtype=output_dtype), torch.empty( - (B, H, DK, DV), dtype=state_dtype), torch.empty((B, S, H, DV), dtype=output_dtype) + dh, dh0, dv2 = ( + torch.empty((B, BS, H, DK, DV), dtype=output_dtype), + torch.empty((B, H, DK, DV), dtype=state_dtype), + torch.empty((B, S, H, DV), dtype=output_dtype), + ) dh_tmp = torch.empty((B, H, DK, DV), dtype=accum_dtype) dv_tmp = torch.empty((B, S, H, DV), dtype=accum_dtype) Q_tmp = torch.empty((B, S, H, DK), dtype=accum_dtype) @@ -138,34 +144,30 @@ def torch_chunk_gated_delta_rule_bwd_dhu( for i_s in range(BS - 1, -1, -1): dh[:, i_s, :, :, :] = dh_tmp - dv_tmp = torch.matmul(K[:, i_s * block_S:(i_s + 1) * block_S, :, :].permute(0, 2, 1, 3), - dh_tmp.to(K.dtype)).permute(0, 2, 1, 3) + dv_tmp = torch.matmul(K[:, i_s * block_S : (i_s + 1) * block_S, :, :].permute(0, 2, 1, 3), dh_tmp.to(K.dtype)).permute(0, 2, 1, 3) if use_g: for i_bh in range(B * H): i_b, i_h = i_bh // H, i_bh % H for i_s2 in range(block_S): - if G[i_b, i_s * block_S + block_S - 1, i_h] - G[i_b, i_s * block_S + i_s2, - i_h] <= 0: - dv_tmp[i_b, i_s2, - i_h, :] *= torch.exp(G[i_b, i_s * block_S + block_S - 1, i_h] - - G[i_b, i_s * block_S + i_s2, i_h]) + if G[i_b, i_s * block_S + block_S - 1, i_h] - G[i_b, i_s * block_S + i_s2, i_h] <= 0: + dv_tmp[i_b, i_s2, i_h, :] *= torch.exp(G[i_b, i_s * block_S + block_S - 1, i_h] - G[i_b, i_s * block_S + i_s2, i_h]) else: dv_tmp[i_b, i_s2, i_h, :] = 0 - dv_tmp += dv[:, i_s * block_S:(i_s + 1) * block_S, :, :] - dv2[:, i_s * block_S:(i_s + 1) * block_S, :, :] = dv_tmp + dv_tmp += dv[:, i_s * 
block_S : (i_s + 1) * block_S, :, :] + dv2[:, i_s * block_S : (i_s + 1) * block_S, :, :] = dv_tmp if use_g: G_last = G[:, i_s * block_S + block_S - 1, :] for i_bh in range(B * H): i_b, i_h = i_bh // H, i_bh % H dh_tmp[i_b, i_h, :, :] *= torch.exp(G_last[i_b, i_h]) - Q_tmp = Q[:, i_s * block_S:(i_s + 1) * block_S, :, :] + Q_tmp = Q[:, i_s * block_S : (i_s + 1) * block_S, :, :] for i_s2 in range(block_S): for i_k in range(DK): Q_tmp[:, i_s2, :, i_k] *= torch.exp(G[:, i_s * block_S + i_s2, :]) Q_tmp *= scale - W_tmp = W[:, i_s * block_S:(i_s + 1) * block_S, :, :] - dO_tmp = dO[:, i_s * block_S:(i_s + 1) * block_S, :, :] + W_tmp = W[:, i_s * block_S : (i_s + 1) * block_S, :, :] + dO_tmp = dO[:, i_s * block_S : (i_s + 1) * block_S, :, :] torch.backends.cuda.matmul.allow_tf32 = True dh_tmp += torch.matmul(Q_tmp.permute(0, 2, 3, 1), dO_tmp.permute(0, 2, 1, 3)) @@ -223,25 +225,24 @@ def tilelang_chunk_gated_delta_rule_bwd_dhu( @T.prim_func def kernel( - # Input - Q: T.Tensor(Q_shape, dtype=input_dtype), - K: T.Tensor(K_shape, dtype=input_dtype), - W: T.Tensor(W_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - h0: T.Tensor(h0_shape, dtype=input_dtype), - dht: T.Tensor(dht_shape, dtype=input_dtype), - dO: T.Tensor(dO_shape, dtype=input_dtype), - dv: T.Tensor(dv_shape, dtype=input_dtype), - # Output - dh: T.Tensor(dh_shape, dtype=output_dtype), - dh0: T.Tensor(dh0_shape, dtype=state_dtype), - dv2: T.Tensor(dv2_shape, dtype=output_dtype), + # Input + Q: T.Tensor(Q_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + h0: T.Tensor(h0_shape, dtype=input_dtype), + dht: T.Tensor(dht_shape, dtype=input_dtype), + dO: T.Tensor(dO_shape, dtype=input_dtype), + dv: T.Tensor(dv_shape, dtype=input_dtype), + # Output + dh: T.Tensor(dh_shape, dtype=output_dtype), + dh0: T.Tensor(dh0_shape, dtype=state_dtype), + dv2: T.Tensor(dv2_shape, dtype=output_dtype), ): with T.Kernel(T.ceildiv(DV, block_DV), B * H, threads=threads) as (bv, bbh): bb, bh = bbh // H, bbh % H b_dh_shared = T.alloc_shared((DK, block_DV), dtype=output_dtype) - b_dh_shared_fp32 = T.alloc_shared((DK, block_DV), dtype=state_dtype) b_dh_fragment = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) b_dh_fragment_1 = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) b_dh_fragment_2 = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) @@ -249,17 +250,14 @@ def kernel( dv_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) dv_fragment_2 = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) dO_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) - dO_shared_t = T.alloc_shared((block_DV, block_S), dtype="float32") - dO_fragment = T.alloc_fragment((block_S, block_DV), dtype="float32") - dO_fragment_t = T.alloc_fragment((block_DV, block_S), dtype="float32") + dO_shared_t = T.alloc_shared((block_DV, block_S), dtype=T.float32) + dO_fragment = T.alloc_fragment((block_S, block_DV), dtype=T.float32) + dO_fragment_t = T.alloc_fragment((block_DV, block_S), dtype=T.float32) K_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) Q_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) - Q_shared_fp32 = T.alloc_shared((block_S, DK), dtype="float32") W_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) - G_last_local = T.alloc_local((1), dtype=gate_dtype) - G_last_local_exp = T.alloc_local((1), dtype=gate_dtype) G_shared = T.alloc_shared((block_S), dtype=gate_dtype, scope="shared") G_fragment = 
T.alloc_fragment((block_S), dtype=gate_dtype) G_fragment_post = T.alloc_fragment((block_S), dtype=gate_dtype) @@ -269,20 +267,15 @@ def kernel( T.use_swizzle(10) - T.annotate_layout({ - b_dh_shared: tilelang.layout.make_swizzled_layout(b_dh_shared), - b_dh_shared_fp32: tilelang.layout.make_swizzled_layout(b_dh_shared_fp32), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dO_shared: tilelang.layout.make_swizzled_layout(dO_shared), - dO_shared_t: tilelang.layout.make_swizzled_layout(dO_shared_t), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - Q_shared_fp32: tilelang.layout.make_swizzled_layout(Q_shared_fp32), - W_shared: tilelang.layout.make_swizzled_layout(W_shared), - }) + T.annotate_layout( + { + dO_shared: tilelang.layout.make_swizzled_layout(dO_shared), + Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), + } + ) if use_final_state_gradient: - T.copy(dht[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV], b_dh_shared) + T.copy(dht[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV], b_dh_shared) T.copy(b_dh_shared, b_dh_fragment) else: T.clear(b_dh_fragment) @@ -293,57 +286,45 @@ def kernel( # Store the updated dh T.copy(b_dh_fragment, b_dh_shared) - T.copy(b_dh_shared, dh[bb, i_s_inv, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + T.copy(b_dh_shared, dh[bb, i_s_inv, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) # Update dv - T.copy(K[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, 0:DK], K_shared) + T.copy(K[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, 0:DK], K_shared) T.gemm(K_shared, b_dh_shared, dv_fragment, clear_accum=True) if use_g: - T.copy( - G[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh], - G_shared, - disable_tma=True) + T.copy(G[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh], G_shared, disable_tma=True) T.copy(G_shared, G_fragment) - G_last_local[0] = G_shared[block_S - 1] - G_last_local_exp[0] = T.exp(G_last_local[0]) + G_last_local = G_shared[block_S - 1] + G_last_local_exp = T.exp(G_last_local) for i_s2 in T.Parallel(block_S): - G_fragment_post[i_s2] = T.exp(G_last_local[0] - G_fragment[i_s2]) + G_fragment_post[i_s2] = T.exp(G_last_local - G_fragment[i_s2]) for i_s2, i_v in T.Parallel(block_S, block_DV): - # with T.If(G_last_local[0] - G_shared[i_s2] <= 0): - with T.If(G_last_local[0] - G_fragment[i_s2] <= 0): - with T.Then(): - dv_fragment[i_s2, - i_v] = dv_fragment[i_s2, i_v] * G_fragment_post[i_s2] - with T.Else(): - dv_fragment[i_s2, i_v] = 0 - - T.copy( - dv[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, - bv * block_DV:(bv + 1) * block_DV], dv_shared) + dv_fragment[i_s2, i_v] = ( + dv_fragment[i_s2, i_v] * G_fragment_post[i_s2] if G_last_local - G_fragment[i_s2] <= 0 else 0 + ) + + T.copy(dv[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], dv_shared) T.copy(dv_shared, dv_fragment_2) for i_s2, i_v in T.Parallel(block_S, block_DV): dv_fragment[i_s2, i_v] = dv_fragment[i_s2, i_v] + dv_fragment_2[i_s2, i_v] # Store the updated dv T.copy(dv_fragment, dv_shared) - T.copy( - dv_shared, dv2[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, - bv * block_DV:(bv + 1) * block_DV]) + T.copy(dv_shared, dv2[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV]) # Update dh - T.copy(Q[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, 0:DK], Q_shared) - T.copy(W[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, 0:DK], W_shared) + T.copy(Q[bb, i_s_inv * block_S : (i_s_inv + 1) * 
block_S, bh, 0:DK], Q_shared) + T.copy(W[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, 0:DK], W_shared) T.clear(Q_fragment) if use_g: for i_k, i_v in T.Parallel(DK, block_DV): - b_dh_fragment[i_k, i_v] *= G_last_local_exp[0] + b_dh_fragment[i_k, i_v] *= G_last_local_exp T.copy(Q_shared, Q_fragment) for i_s2 in T.Parallel(block_S): G_fragment_exp[i_s2] = T.exp(G_shared[i_s2]) for i_s2, i_k in T.Parallel(block_S, DK): - # Q_fragment[i_s2, i_k] = Q_fragment[i_s2, i_k] * T.exp(G_shared[i_s2]) * scale Q_fragment[i_s2, i_k] = Q_fragment[i_s2, i_k] * G_fragment_exp[i_s2] * scale else: T.copy(Q_shared, Q_fragment) @@ -353,9 +334,7 @@ def kernel( for i_s2, i_k in T.Parallel(block_S, DK): Q_fragment_t[i_k, i_s2] = Q_fragment[i_s2, i_k] - T.copy( - dO[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, - bv * block_DV:(bv + 1) * block_DV], dO_shared) + T.copy(dO[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], dO_shared) T.copy(dO_shared, dO_fragment) for i_s2, i_v in T.Parallel(block_S, block_DV): dO_fragment_t[i_v, i_s2] = dO_fragment[i_s2, i_v] @@ -369,7 +348,7 @@ def kernel( b_dh_fragment[i_k, i_v] += b_dh_fragment_1[i_k, i_v] - b_dh_fragment_2[i_k, i_v] if use_initial_state: - T.copy(b_dh_fragment, dh0[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + T.copy(b_dh_fragment, dh0[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) return kernel @@ -444,44 +423,61 @@ def run_test( num_stages=0, use_torch=False, ): - Q, K, W, G, h0, dht, dO, dv = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - dh_ref, dh0_ref, dv2_ref = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - dh_tilelang, dh0_tilelang, dv2_tilelang = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) + Q, K, W, G, h0, dht, dO, dv = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + dh_ref, dh0_ref, dv2_ref = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) + dh_tilelang, dh0_tilelang, dv2_tilelang = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) # fla ref print("fla running...", flush=True) if use_g: - dh_ref, dh0_ref, dv2_ref = chunk_gated_delta_rule_bwd_dhu(Q, K, W, G, h0, dht, dO, dv, - scale) + dh_ref, dh0_ref, dv2_ref = chunk_gated_delta_rule_bwd_dhu(Q, K, W, G, h0, dht, dO, dv, scale) else: G = G.fill_(0) - dh_ref, dh0_ref, dv2_ref = chunk_gated_delta_rule_bwd_dhu(Q, K, W, G, h0, dht, dO, dv, - scale) + dh_ref, dh0_ref, dv2_ref = chunk_gated_delta_rule_bwd_dhu(Q, K, W, G, h0, dht, dO, dv, scale) # tilelang print("tilelang running...", flush=True) - kernel = tilelang_chunk_gated_delta_rule_bwd_dhu(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, - chunk_size, scale, use_g, use_initial_state, - use_final_state_gradient, block_DV, threads, - num_stages) + kernel = tilelang_chunk_gated_delta_rule_bwd_dhu( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + 
state_dtype, + chunk_size, + scale, + use_g, + use_initial_state, + use_final_state_gradient, + block_DV, + threads, + num_stages, + ) # kernel = tilelang.compile(program) print(kernel.get_kernel_source()) dh_tilelang, dh0_tilelang, dv2_tilelang = kernel(Q, K, W, G, h0, dht, dO, dv) - fla_time = do_bench( - chunk_gated_delta_rule_bwd_dhu, Q, K, W, G, h0, dht, dO, dv, scale, chunk_size=chunk_size) + fla_time = do_bench(chunk_gated_delta_rule_bwd_dhu, Q, K, W, G, h0, dht, dO, dv, scale, chunk_size=chunk_size) tilelang_time = do_bench(kernel, Q, K, W, G, h0, dht, dO, dv) print(f"fla time: {fla_time} ms") @@ -496,19 +492,47 @@ def run_test( print("torch running...", flush=True) if use_g: dh_ref_torch, dh0_ref_torch, dv2_ref_torch = torch_chunk_gated_delta_rule_bwd_dhu( - Q, K, W, G, h0, dht, dO, dv, scale, use_g, use_initial_state, - use_final_state_gradient, getattr(torch, input_dtype), getattr(torch, output_dtype), - getattr(torch, accum_dtype), getattr(torch, - gate_dtype), getattr(torch, state_dtype)) + Q, + K, + W, + G, + h0, + dht, + dO, + dv, + scale, + use_g, + use_initial_state, + use_final_state_gradient, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) dh_ref_torch = dh_ref_torch.cuda() dh0_ref_torch = dh0_ref_torch.cuda() dv2_ref_torch = dv2_ref_torch.cuda() else: dh_ref_torch, dh0_ref_torch, dv2_ref_torch = torch_chunk_gated_delta_rule_bwd_dhu( - Q, K, W, None, h0, dht, dO, dv, scale, use_g, use_initial_state, - use_final_state_gradient, getattr(torch, input_dtype), getattr(torch, output_dtype), - getattr(torch, accum_dtype), getattr(torch, - gate_dtype), getattr(torch, state_dtype)) + Q, + K, + W, + None, + h0, + dht, + dO, + dv, + scale, + use_g, + use_initial_state, + use_final_state_gradient, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) dh_ref_torch = dh_ref_torch.cuda() dh0_ref_torch = dh0_ref_torch.cuda() dv2_ref_torch = dv2_ref_torch.cuda() @@ -521,31 +545,6 @@ def run_test( assert_similar(dv2_ref_torch, dv2_tilelang, 1e-5, "torch-tilelang", data="dv2") -def do_bench(fn, *args, warmup=10, rep=10, **kwargs): - """ - Do benchmark for a function. 
- """ - start_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - end_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - for _ in range(warmup): - fn(*args, **kwargs) - - torch.cuda.synchronize() - for i in range(rep): - start_event[i].record() - fn(*args, **kwargs) - end_event[i].record() - torch.cuda.synchronize() - - # Record clocks - times = torch.tensor( - [s.elapsed_time(e) for s, e in zip(start_event, end_event)], - dtype=torch.float, - ) - - return times.mean().item() - - def main(): DK = 128 run_test( @@ -554,11 +553,11 @@ def main(): H=8, DK=DK, DV=128, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", - gate_dtype="float32", - state_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, + gate_dtype=T.float32, + state_dtype=T.float32, chunk_size=64, scale=DK**-0.5, use_g=True, diff --git a/examples/gdn/example_chunk_delta_h.py b/examples/gdn/example_chunk_delta_h.py index 4d6b657ff..c34d9b530 100644 --- a/examples/gdn/example_chunk_delta_h.py +++ b/examples/gdn/example_chunk_delta_h.py @@ -3,12 +3,15 @@ import sys # noqa: F401 import tilelang import tilelang.language as T +from tilelang.autotuner import autotune +from tilelang.profiler import do_bench # Add your fla repository path to sys.path # Currently we use the fla repository from the flash-linear-attention project at commit id f03cb3ae # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.common.chunk_delta_h import chunk_gated_delta_rule_fwd_h except ImportError: @@ -19,7 +22,7 @@ import torch.nn.functional as F from tilelang.engine.callback import register_cuda_postproc_callback # noqa: F401 -from utils import * +from test_utils import assert_similar # (zhengju) We can slightly modify the generated cuda code from tilelang lowering # in the debug folder to make the performance better. 
To enable this callback, @@ -55,6 +58,7 @@ def prepare_input( G = F.logsigmoid(G) try: from fla.ops.utils.cumsum import chunk_local_cumsum + G = chunk_local_cumsum(G, chunk_size) except ImportError: print("fla not found, skip cumsum") @@ -80,7 +84,21 @@ def prepare_output( return h, final_state, V_new -@tilelang.jit(out_idx=[-3, -2, -1]) +def get_configs(): + import itertools + + block_DK = [32, 64, 128] + block_DV = [32, 64, 128] + threads = [128, 256] + num_stages = [1, 2, 3] + _configs = list(itertools.product(block_DK, block_DV, threads, num_stages)) + + configs = [{"block_DK": c[0], "block_DV": c[1], "threads": c[2], "num_stages": c[3]} for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=3, rep=5) +@tilelang.jit(out_idx=[-3, -2, -1], pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}) def tilelang_chunk_gated_delta_rule_fwd_h( # task config B, @@ -94,15 +112,15 @@ def tilelang_chunk_gated_delta_rule_fwd_h( gate_dtype, state_dtype, chunk_size, - use_g=True, - use_initial_state=True, - store_final_state=True, - save_new_value=True, + use_g, + use_initial_state, + store_final_state, + save_new_value, # kernel config block_DK=64, - block_DV=64, - threads=256, - num_stages=0, + block_DV=32, + threads=128, + num_stages=1, ): block_S = chunk_size BS = S // block_S @@ -118,14 +136,14 @@ def tilelang_chunk_gated_delta_rule_fwd_h( @T.prim_func def kernel( - K: T.Tensor(K_shape, dtype=input_dtype), - W: T.Tensor(W_shape, dtype=input_dtype), - U: T.Tensor(U_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - initial_state: T.Tensor(initial_state_shape, dtype=input_dtype), - h: T.Tensor(h_shape, dtype=output_dtype), - final_state: T.Tensor(final_state_shape, dtype=state_dtype), - V_new: T.Tensor(V_shape, dtype=output_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + U: T.Tensor(U_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + initial_state: T.Tensor(initial_state_shape, dtype=input_dtype), + h: T.Tensor(h_shape, dtype=output_dtype), + final_state: T.Tensor(final_state_shape, dtype=state_dtype), + V_new: T.Tensor(V_shape, dtype=output_dtype), ): with T.Kernel(T.ceildiv(DV, block_DV), B * H, threads=threads) as (bv, bbh): bb, bh = bbh // H, bbh % H @@ -139,39 +157,35 @@ def kernel( V_new_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) V_new_shared = T.alloc_shared((block_S, block_DV), dtype=output_dtype) K_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) - G_last_local = T.alloc_local((1), dtype=gate_dtype) + G_last_local = T.alloc_var(T.float32) G_shared = T.alloc_shared((block_S, block_DV), dtype=gate_dtype) G_fragment = T.alloc_fragment((block_S, block_DV), dtype=gate_dtype) - T.annotate_layout({ - b_h_shared: tilelang.layout.make_swizzled_layout(b_h_shared), - U_shared: tilelang.layout.make_swizzled_layout(U_shared), - W_shared: tilelang.layout.make_swizzled_layout(W_shared), - V_new_shared: tilelang.layout.make_swizzled_layout(V_new_shared), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - G_shared: tilelang.layout.make_swizzled_layout(G_shared), - }) + T.annotate_layout( + { + U_shared: tilelang.layout.make_swizzled_layout(U_shared), + G_shared: tilelang.layout.make_swizzled_layout(G_shared), + } + ) T.use_swizzle(10) if use_initial_state: - T.copy(initial_state[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV], b_h_shared) + T.copy(initial_state[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV], b_h_shared) 
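For readers without fla installed, here is a plain-PyTorch stand-in for the gate preprocessing done in `prepare_input` above; it assumes `chunk_local_cumsum` performs an inclusive cumulative sum within each chunk along the sequence dimension:

import torch
import torch.nn.functional as F

B, S, H, chunk_size = 1, 256, 4, 64  # illustrative sizes only
G = F.logsigmoid(torch.randn(B, S, H, dtype=torch.float32))

# Inclusive cumsum inside each chunk of length chunk_size along S.
G_local = G.view(B, S // chunk_size, chunk_size, H).cumsum(dim=2).view(B, S, H)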
T.copy(b_h_shared, b_h_fragment) else: T.clear(b_h_fragment) for i_s in T.Pipelined(T.ceildiv(S, block_S), num_stages=num_stages): # Store previous result to the hidden tensor, like the epilogue - T.copy(b_h_shared, h[bb, i_s, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + T.copy(b_h_shared, h[bb, i_s, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) # Recurrence - T.copy(W[bb, i_s * block_S:(i_s + 1) * block_S, bh, 0:DK], W_shared) + T.copy(W[bb, i_s * block_S : (i_s + 1) * block_S, bh, 0:DK], W_shared) T.gemm(W_shared, b_h_shared, V_new_fragment, clear_accum=True) # U - W * S - T.copy( - U[bb, i_s * block_S:(i_s + 1) * block_S, bh, bv * block_DV:(bv + 1) * block_DV], - U_shared) + T.copy(U[bb, i_s * block_S : (i_s + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], U_shared) T.copy(U_shared, U_fragment) for i_s2, i_v in T.Parallel(block_S, block_DV): V_new_fragment[i_s2, i_v] = -V_new_fragment[i_s2, i_v] + U_fragment[i_s2, i_v] @@ -179,27 +193,24 @@ def kernel( # Save V_new if save_new_value: T.copy(V_new_fragment, dst=V_new_shared) - T.copy( - V_new_shared, V_new[bb, i_s * block_S:(i_s + 1) * block_S, bh, - bv * block_DV:(bv + 1) * block_DV]) + T.copy(V_new_shared, V_new[bb, i_s * block_S : (i_s + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV]) - T.copy(K[bb, i_s * block_S:(i_s + 1) * block_S, bh, 0:DK], K_shared) + T.copy(K[bb, i_s * block_S : (i_s + 1) * block_S, bh, 0:DK], K_shared) # use_g if use_g: - G_last_local[0] = G[bb, (i_s + 1) * block_S - 1, bh] + G_last_local = G[bb, (i_s + 1) * block_S - 1, bh] for i_s2, i_v in T.Parallel(block_S, block_DV): G_shared[i_s2, i_v] = G[bb, i_s * block_S + i_s2, bh] T.copy(G_shared, G_fragment) for i_s2, i_v in T.Parallel(block_S, block_DV): - with T.If(G_last_local[0] - G_fragment[i_s2, i_v] <= 0): - with T.Then(): - V_new_fragment[i_s2, i_v] = V_new_fragment[i_s2, i_v] * T.exp( - G_last_local[0] - G_fragment[i_s2, i_v]) - with T.Else(): - V_new_fragment[i_s2, i_v] = 0 - G_last_local[0] = T.exp(G_last_local[0]) + V_new_fragment[i_s2, i_v] = ( + V_new_fragment[i_s2, i_v] * T.exp2((G_last_local - G_fragment[i_s2, i_v]) * 1.442695) + if G_last_local - G_fragment[i_s2, i_v] <= 0 + else 0 + ) + G_last_local = T.exp2(G_last_local * 1.442695) for i_k, i_v in T.Parallel(DK, block_DV): - b_h_fragment[i_k, i_v] *= G_last_local[0] + b_h_fragment[i_k, i_v] *= G_last_local # Update intermediate results T.copy(V_new_fragment, V_new_shared) @@ -209,36 +220,11 @@ def kernel( # Save final state if store_final_state: - T.copy(b_h_fragment, final_state[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + T.copy(b_h_fragment, final_state[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) return kernel -def do_bench(fn, *args, warmup=10, rep=10, **kwargs): - """ - Do benchmark for a function. 
- """ - start_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - end_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - for _ in range(warmup): - fn(*args, **kwargs) - - torch.cuda.synchronize() - for i in range(rep): - start_event[i].record() - fn(*args, **kwargs) - end_event[i].record() - torch.cuda.synchronize() - - # Record clocks - times = torch.tensor( - [s.elapsed_time(e) for s, e in zip(start_event, end_event)], - dtype=torch.float, - ) - - return times.mean().item() - - def run_test( B, S, @@ -260,47 +246,77 @@ def run_test( threads=128, num_stages=0, ): - K, W, U, G, initial_state = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype)) - h_ref, final_state_ref, V_new_ref = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, state_dtype)) - h_tilelang, final_state_tilelang, V_new_tilelang = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, state_dtype)) + K, W, U, G, initial_state = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + ) + h_ref, final_state_ref, V_new_ref = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, state_dtype) + ) + h_tilelang, final_state_tilelang, V_new_tilelang = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, state_dtype) + ) # fla ref - h_ref, V_new_ref, final_state_ref = chunk_gated_delta_rule_fwd_h(K, W, U, G, initial_state, - store_final_state, chunk_size, - save_new_value) + h_ref, V_new_ref, final_state_ref = chunk_gated_delta_rule_fwd_h( + k=K, + w=W, + u=U, + g=G, + initial_state=initial_state, + output_final_state=store_final_state, + chunk_size=chunk_size, + save_new_value=save_new_value, + ) # tilelang - kernel = tilelang_chunk_gated_delta_rule_fwd_h(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, chunk_size, - use_g, use_initial_state, store_final_state, - save_new_value, block_DK, block_DV, threads, - num_stages) + kernel = tilelang_chunk_gated_delta_rule_fwd_h( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + use_g, + use_initial_state, + store_final_state, + save_new_value, + ) h_tilelang, final_state_tilelang, V_new_tilelang = kernel(K, W, U, G, initial_state) # (zhengju) If you want to print the generated cuda code, you can uncomment the following line # print("CUDA Code:\n", kernel.get_kernel_source()) - fla_time = do_bench(chunk_gated_delta_rule_fwd_h, K, W, U, G, initial_state, store_final_state, - chunk_size, save_new_value) + fla_time = do_bench( + chunk_gated_delta_rule_fwd_h, + k=K, + w=W, + u=U, + g=G, + initial_state=initial_state, + output_final_state=store_final_state, + chunk_size=chunk_size, + save_new_value=save_new_value, + ) tilelang_time = do_bench(kernel, K, W, U, G, initial_state) # check correctness try: h_ref_fp32 = h_ref.to(torch.float32) h_tilelang_fp32 = h_tilelang.to(torch.float32) - assert_similar( - h_ref_fp32, - h_tilelang_fp32, - eps=1e-5, - name="tilelang chunk gated delta rule fwd h", - raise_assert=False) + assert_similar(h_ref_fp32, h_tilelang_fp32, eps=1e-5, name="tilelang chunk gated delta rule fwd h", raise_assert=False) print("tilelang chunk gated delta 
rule fwd h passed √") except Exception as e: print("tilelang chunk gated delta rule fwd h failed ✗") @@ -314,7 +330,8 @@ def run_test( final_state_tilelang_fp32, eps=1e-5, name="tilelang chunk gated delta rule fwd final_state", - raise_assert=False) + raise_assert=False, + ) print("tilelang chunk gated delta rule fwd final_state passed √") except Exception as e: print("tilelang chunk gated delta rule fwd final_state failed ✗") @@ -323,12 +340,7 @@ def run_test( try: V_new_ref_fp32 = V_new_ref.to(torch.float32) V_new_tilelang_fp32 = V_new_tilelang.to(torch.float32) - assert_similar( - V_new_ref_fp32, - V_new_tilelang_fp32, - eps=1e-5, - name="tilelang chunk gated delta rule fwd V_new", - raise_assert=False) + assert_similar(V_new_ref_fp32, V_new_tilelang_fp32, eps=1e-5, name="tilelang chunk gated delta rule fwd V_new", raise_assert=False) print("tilelang chunk gated delta rule fwd V_new passed √") except Exception as e: print("tilelang chunk gated delta rule fwd V_new failed ✗") @@ -345,20 +357,20 @@ def main(): H=32, DK=128, DV=128, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", - gate_dtype="float32", - state_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, + gate_dtype=T.float32, + state_dtype=T.float32, chunk_size=64, use_g=True, - use_initial_state=True, + use_initial_state=False, store_final_state=True, save_new_value=True, - block_DK=64, + block_DK=32, block_DV=32, threads=128, - num_stages=1, + num_stages=2, ) diff --git a/examples/gdn/example_chunk_o.py b/examples/gdn/example_chunk_o.py index 1c084be70..bb95f555f 100644 --- a/examples/gdn/example_chunk_o.py +++ b/examples/gdn/example_chunk_o.py @@ -9,6 +9,7 @@ # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.common.chunk_o import chunk_fwd_o except ImportError: @@ -87,16 +88,14 @@ def tilelang_chunk_fwd_o( @T.prim_func def kernel( - Q: T.Tensor(Q_shape, dtype=input_dtype), - K: T.Tensor(K_shape, dtype=input_dtype), - V: T.Tensor(V_shape, dtype=input_dtype), - HIDDEN: T.Tensor(H_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - O: T.Tensor(O_shape, dtype=output_dtype), + Q: T.Tensor(Q_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + HIDDEN: T.Tensor(H_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + O: T.Tensor(O_shape, dtype=output_dtype), ): - with T.Kernel( - T.ceildiv(DV, block_DV), T.ceildiv(S, block_S), B * H, - threads=threads) as (bv, bs, bbh): + with T.Kernel(T.ceildiv(DV, block_DV), T.ceildiv(S, block_S), B * H, threads=threads) as (bv, bs, bbh): bb, bh = bbh // H, bbh % H Q_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) K_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) @@ -109,28 +108,13 @@ def kernel( G_shared = T.alloc_shared((block_S,), dtype=gate_dtype, scope="shared") G_diff_local = T.alloc_fragment((block_S, block_S), dtype=gate_dtype) - T.annotate_layout({ - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - V_shared: tilelang.layout.make_swizzled_layout(V_shared), - H_shared: tilelang.layout.make_swizzled_layout(H_shared), - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) - T.clear(A_fragment) T.clear(O_fragment) T.disable_warp_group_reg_alloc() for i_k in T.Pipelined(T.ceildiv(DK, block_DK), 
num_stages=num_stages): - T.copy( - Q[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - Q_shared) - T.copy( - K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - K_shared) - T.copy( - HIDDEN[bb, bs, bh, i_k * block_DK:(i_k + 1) * block_DK, - bv * block_DV:(bv + 1) * block_DV], H_shared) + T.copy(Q[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], Q_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], K_shared) + T.copy(HIDDEN[bb, bs, bh, i_k * block_DK : (i_k + 1) * block_DK, bv * block_DV : (bv + 1) * block_DV], H_shared) T.gemm(Q_shared, H_shared, O_fragment) T.gemm(Q_shared, K_shared, A_fragment, transpose_B=True) @@ -143,20 +127,17 @@ def kernel( for i_s1, i_s2 in T.Parallel(block_S, block_S): G_diff_local[i_s1, i_s2] = G_shared[i_s1] - G_shared[i_s2] for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(G_diff_local[i_s1, i_s2] <= 0): - with T.Then(): - A_fragment[i_s1, i_s2] = A_fragment[i_s1, i_s2] * T.exp( - G_diff_local[i_s1, i_s2]) - with T.Else(): - A_fragment[i_s1, i_s2] = 0 + A_fragment[i_s1, i_s2] = T.if_then_else( + G_diff_local[i_s1, i_s2] <= 0, + A_fragment[i_s1, i_s2] * T.exp(G_diff_local[i_s1, i_s2]), + 0, + ) for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(i_s1 < i_s2): # noqa: SIM117 - with T.Then(): - A_fragment[i_s1, i_s2] = 0 + if i_s1 < i_s2: + A_fragment[i_s1, i_s2] = 0 - T.copy(V[bb, bs * block_S:(bs + 1) * block_S, bh, bv * block_DV:(bv + 1) * block_DV], - V_shared) + T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], V_shared) T.copy(A_fragment, A_shared) T.gemm(A_shared, V_shared, O_fragment) @@ -164,8 +145,7 @@ def kernel( O_fragment[i_s, i_v] = O_fragment[i_s, i_v] * scale T.copy(O_fragment, O_shared) - T.copy(O_shared, O[bb, bs * block_S:(bs + 1) * block_S, bh, - bv * block_DV:(bv + 1) * block_DV]) + T.copy(O_shared, O[bb, bs * block_S : (bs + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV]) return kernel @@ -191,8 +171,9 @@ def run_test( output_dtype_torch = getattr(torch, output_dtype) accum_dtype_torch = getattr(torch, accum_dtype) gate_dtype_torch = getattr(torch, gate_dtype) - Q, K, V, HIDDEN, G = prepare_input(B, S, H, DK, DV, chunk_size, input_dtype_torch, - output_dtype_torch, accum_dtype_torch, gate_dtype_torch) + Q, K, V, HIDDEN, G = prepare_input( + B, S, H, DK, DV, chunk_size, input_dtype_torch, output_dtype_torch, accum_dtype_torch, gate_dtype_torch + ) scale = 1.0 / DK**0.5 O_ref = prepare_output(B, S, H, DK, DV, chunk_size, output_dtype_torch) @@ -200,9 +181,25 @@ def run_test( block_S = chunk_size O_tilelang = prepare_output(B, S, H, DK, DV, chunk_size, output_dtype_torch) - kernel = tilelang_chunk_fwd_o(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, chunk_size, scale, use_g, block_S, block_DK, block_DV, - threads, num_stages) + kernel = tilelang_chunk_fwd_o( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + chunk_size, + scale, + use_g, + block_S, + block_DK, + block_DV, + threads, + num_stages, + ) O_tilelang = kernel(Q, K, V, HIDDEN, G) try: @@ -221,10 +218,10 @@ def main(): DK=128, DV=128, chunk_size=64, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", - gate_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, + gate_dtype=T.float32, use_g=True, block_DK=128, block_DV=128, diff --git 
a/examples/gdn/example_chunk_o_bwd.py b/examples/gdn/example_chunk_o_bwd.py index 7e87a2c4f..19233de62 100644 --- a/examples/gdn/example_chunk_o_bwd.py +++ b/examples/gdn/example_chunk_o_bwd.py @@ -12,6 +12,7 @@ # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.common.chunk_o import chunk_bwd_dqkwg except ImportError: @@ -19,7 +20,7 @@ fla = None import torch -from utils import * +from test_utils import assert_similar torch.random.manual_seed(0) # torch.set_printoptions(profile="full") @@ -108,10 +109,8 @@ def prepare_output( @tilelang.jit( out_idx=[-4, -3, -2, -1], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - }) + pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, +) def tilelang_chunk_o_bwd_dqkwg( # task config B, @@ -155,25 +154,23 @@ def tilelang_chunk_o_bwd_dqkwg( @T.prim_func def kernel( - # input - Q: T.Tensor(Q_shape, dtype=input_dtype), - K: T.Tensor(K_shape, dtype=input_dtype), - V: T.Tensor(V_shape, dtype=input_dtype), - h: T.Tensor(h_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - dO: T.Tensor(dO_shape, dtype=input_dtype), - dh: T.Tensor(dh_shape, dtype=input_dtype), - dv: T.Tensor(dv_shape, dtype=input_dtype), - W: T.Tensor(W_shape, dtype=input_dtype), - # output - dq: T.Tensor(dq_shape, dtype=output_dtype), - dk: T.Tensor(dk_shape, dtype=output_dtype), - dw: T.Tensor(dw_shape, dtype=output_dtype), - dg: T.Tensor(dg_shape, dtype=gate_dtype), + # input + Q: T.Tensor(Q_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + h: T.Tensor(h_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + dO: T.Tensor(dO_shape, dtype=input_dtype), + dh: T.Tensor(dh_shape, dtype=input_dtype), + dv: T.Tensor(dv_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + # output + dq: T.Tensor(dq_shape, dtype=output_dtype), + dk: T.Tensor(dk_shape, dtype=output_dtype), + dw: T.Tensor(dw_shape, dtype=output_dtype), + dg: T.Tensor(dg_shape, dtype=gate_dtype), ): - with T.Kernel( - T.ceildiv(DK, block_DK), T.ceildiv(S, block_S), B * H, - threads=threads) as (bk, bs, bbh): + with T.Kernel(T.ceildiv(DK, block_DK), T.ceildiv(S, block_S), B * H, threads=threads) as (bk, bs, bbh): bb, bh = bbh // H, bbh % H V_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) @@ -202,27 +199,27 @@ def kernel( dg_fragment = T.alloc_fragment((block_S,), dtype=gate_dtype) dg_fragment_2 = T.alloc_fragment((block_S,), dtype=gate_dtype) dg_fragment_final = T.alloc_fragment((block_S,), dtype=gate_dtype) - dg_last_local = T.alloc_local((2,), dtype=gate_dtype) + dg_last_local_0 = T.alloc_var(dtype=gate_dtype) + dg_last_local_1 = T.alloc_var(dtype=gate_dtype) + G_last_local = T.alloc_var(dtype=gate_dtype) + dg_last_fragment = T.alloc_fragment((block_DV * block_DK), dtype=gate_dtype) dg_last_fragment_scalar = T.alloc_fragment((1,), dtype=gate_dtype) dg_last_fragment_2 = T.alloc_fragment((block_S * block_DK), dtype=gate_dtype) dg_last_fragment_scalar_2 = T.alloc_fragment((1,), dtype=gate_dtype) - G_shared = T.alloc_shared((block_S, block_DK), dtype=gate_dtype, scope="shared") - G_last_local = T.alloc_local((1,), dtype=gate_dtype) + G_shared = T.alloc_shared((block_S, block_DK), dtype=gate_dtype) T.use_swizzle(10) - T.annotate_layout({ - V_shared: tilelang.layout.make_swizzled_layout(V_shared), - 
dO_shared: tilelang.layout.make_swizzled_layout(dO_shared), - h_shared: tilelang.layout.make_swizzled_layout(h_shared), - dh_shared: tilelang.layout.make_swizzled_layout(dh_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - q_shared: tilelang.layout.make_swizzled_layout(q_shared), - k_shared: tilelang.layout.make_swizzled_layout(k_shared), - }) - - T.clear(dg_last_local) + T.annotate_layout( + { + q_shared: tilelang.layout.make_swizzled_layout(q_shared), + k_shared: tilelang.layout.make_swizzled_layout(k_shared), + } + ) + + T.clear(dg_last_local_0) + T.clear(dg_last_local_1) T.clear(G_last_local) T.clear(G_shared) T.clear(q_fragment) @@ -235,18 +232,10 @@ def kernel( T.clear(dw_fragment) for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): - T.copy( - V[bb, bs * block_S:(bs + 1) * block_S, bh, i_v * block_DV:(i_v + 1) * block_DV], - V_shared) - T.copy( - dO[bb, bs * block_S:(bs + 1) * block_S, bh, - i_v * block_DV:(i_v + 1) * block_DV], dO_shared) - T.copy( - h[bb, bs, bh, bk * block_DK:(bk + 1) * block_DK, - i_v * block_DV:(i_v + 1) * block_DV], h_shared) - T.copy( - dh[bb, bs, bh, bk * block_DK:(bk + 1) * block_DK, - i_v * block_DV:(i_v + 1) * block_DV], dh_shared) + T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], V_shared) + T.copy(dO[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], dO_shared) + T.copy(h[bb, bs, bh, bk * block_DK : (bk + 1) * block_DK, i_v * block_DV : (i_v + 1) * block_DV], h_shared) + T.copy(dh[bb, bs, bh, bk * block_DK : (bk + 1) * block_DK, i_v * block_DV : (i_v + 1) * block_DV], dh_shared) if use_g: T.clear(dg_last_fragment_scalar) @@ -254,33 +243,25 @@ def kernel( # for i_kv in T.Parallel(block_DK * block_DV): # dg_last_fragment[i_kv] = h_shared[i_kv // block_DV, i_kv % block_DV] * dh_shared[i_kv // block_DV, i_kv % block_DV] for i_kv in T.Parallel(block_DK * block_DV): - dg_last_fragment[i_kv] = h_shared[i_kv // block_DV, i_kv % - block_DV] * dh_shared[i_kv // block_DV, - i_kv % block_DV] + dg_last_fragment[i_kv] = h_shared[i_kv // block_DV, i_kv % block_DV] * dh_shared[i_kv // block_DV, i_kv % block_DV] T.reduce_sum(dg_last_fragment, dg_last_fragment_scalar, dim=-1, clear=False) - dg_last_local[0] += dg_last_fragment_scalar[0] + dg_last_local_0 = dg_last_local_0 + dg_last_fragment_scalar[0] T.gemm(dO_shared, V_shared, ds_fragment, transpose_B=True) T.gemm(dO_shared, h_shared, dq_fragment, transpose_B=True) T.gemm(V_shared, dh_shared, dk_fragment, transpose_B=True) if use_dw: - T.copy( - dv[bb, bs * block_S:(bs + 1) * block_S, bh, - i_v * block_DV:(i_v + 1) * block_DV], dv_shared) + T.copy(dv[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], dv_shared) T.gemm(dv_shared, h_shared, dw_fragment, transpose_B=True) if use_dw: for i_s, i_k in T.Parallel(block_S, block_DK): dw_fragment[i_s, i_k] = -dw_fragment[i_s, i_k] - T.copy( - dw_fragment, dw[bb, bs * block_S:(bs + 1) * block_S, bh, - bk * block_DK:(bk + 1) * block_DK]) - - T.copy(Q[bb, bs * block_S:(bs + 1) * block_S, bh, bk * block_DK:(bk + 1) * block_DK], - q_shared) - T.copy(K[bb, bs * block_S:(bs + 1) * block_S, bh, bk * block_DK:(bk + 1) * block_DK], - k_shared) + T.copy(dw_fragment, dw[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + + T.copy(Q[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK], q_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK], 
k_shared) T.copy(q_shared, q_fragment) T.copy(k_shared, k_fragment) @@ -289,13 +270,12 @@ def kernel( T.clear(dg_fragment_2) for i_s, i_k in T.Parallel(block_S, block_DK): G_shared[i_s, i_k] = G[bb, bs * block_S + i_s, bh] - G_last_local[0] = G[bb, bs * block_S + block_S - 1, bh] + G_last_local = G[bb, bs * block_S + block_S - 1, bh] # Use gmem directly instead of local register - dg_last_local[0] = dg_last_local[0] * T.exp(G[bb, bs * block_S + block_S - 1, bh]) + dg_last_local_0 = dg_last_local_0 * T.exp(G[bb, bs * block_S + block_S - 1, bh]) for i_s, i_k in T.Parallel(block_S, block_DK): - dq_fragment[i_s, i_k] = dq_fragment[i_s, i_k] * T.exp(G[bb, bs * block_S + i_s, - bh]) * scale + dq_fragment[i_s, i_k] = dq_fragment[i_s, i_k] * T.exp(G[bb, bs * block_S + i_s, bh]) * scale T.clear(dg_fragment_reduce_tmp) for i_s, i_k in T.Parallel(block_S, block_DK): dg_fragment_reduce_tmp[i_s, i_k] = dq_fragment[i_s, i_k] * q_shared[i_s, i_k] @@ -303,12 +283,11 @@ T.reduce_sum(dg_fragment_reduce_tmp, dg_fragment, dim=-1, clear=False) for i_s, i_k in T.Parallel(block_S, block_DK): - with T.If(G_last_local[0] - G[bb, bs * block_S + i_s, bh] <= 0): - with T.Then(): - dk_fragment[i_s, i_k] = dk_fragment[i_s, i_k] * T.exp( - G_last_local[0] - G[bb, bs * block_S + i_s, bh]) - with T.Else(): - dk_fragment[i_s, i_k] = 0 + dk_fragment[i_s, i_k] = ( + dk_fragment[i_s, i_k] * T.exp(G_last_local - G[bb, bs * block_S + i_s, bh]) + if G_last_local - G[bb, bs * block_S + i_s, bh] <= 0 + else 0 + ) T.clear(dg_fragment_reduce_tmp) for i_s, i_k in T.Parallel(block_S, block_DK): dg_fragment_reduce_tmp[i_s, i_k] = dk_fragment[i_s, i_k] * (-k_shared[i_s, i_k]) @@ -322,24 +301,20 @@ def kernel( i_s, i_k = i_sk // block_DK, i_sk % block_DK dg_last_fragment_2[i_sk] = dk_shared[i_s, i_k] * k_shared[i_s, i_k] T.reduce_sum(dg_last_fragment_2, dg_last_fragment_scalar_2, dim=-1, clear=False) - dg_last_local[1] = dg_last_fragment_scalar_2[0] + dg_last_local_1 = dg_last_fragment_scalar_2[0] for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(i_s1 >= i_s2 and - G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh] <= 0): - with T.Then(): - ds_fragment[i_s1, i_s2] = ds_fragment[ - i_s1, i_s2] * T.exp(G[bb, bs * block_S + i_s1, bh] - - G[bb, bs * block_S + i_s2, bh]) * scale - with T.Else(): - ds_fragment[i_s1, i_s2] = 0 + ds_fragment[i_s1, i_s2] = ( + (ds_fragment[i_s1, i_s2] * T.exp(G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh]) * scale) + if G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh] <= 0 + else 0 + ) T.clear(ds_fragment_positive) T.clear(ds_fragment_positive_transpose) T.gemm(q_shared, k_shared, ds_fragment_positive, transpose_B=True) for i_s1, i_s2 in T.Parallel(block_S, block_S): - ds_fragment_positive[ - i_s1, i_s2] = ds_fragment[i_s1, i_s2] * ds_fragment_positive[i_s1, i_s2] + ds_fragment_positive[i_s1, i_s2] = ds_fragment[i_s1, i_s2] * ds_fragment_positive[i_s1, i_s2] # FIXME: The reduce_sum statement with clear=True will cause an error of warp specialized pass T.reduce_sum(ds_fragment_positive, dg_fragment, dim=1, clear=False) @@ -361,25 +336,16 @@ def kernel( T.gemm(ds_shared, q_shared, dk_fragment, transpose_A=True) for i_s in T.Parallel(block_S): - with T.If(i_s >= block_S - 1): # noqa: SIM117 - with T.Then(): - dg_fragment_final[ - i_s] = dg_fragment_final[i_s] + dg_last_local[0] + dg_last_local[1] - - T.copy( - dq_fragment, dq[bb, bs * block_S:(bs + 1) * block_S, bh, - bk * block_DK:(bk + 1) * block_DK]) - T.copy( - dk_fragment, dk[bb, bs *
block_S:(bs + 1) * block_S, bh, - bk * block_DK:(bk + 1) * block_DK]) + dg_fragment_final[i_s] = dg_fragment_final[i_s] + dg_last_local_0 + dg_last_local_1 + + T.copy(dq_fragment, dq[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + T.copy(dk_fragment, dk[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) for i_s in T.Parallel(block_S): dg[bk, bb, bs * block_S + i_s, bh] = dg_fragment_final[i_s] else: for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(i_s1 < i_s2): # noqa: SIM117 - with T.Then(): - ds_fragment[i_s1, i_s2] = 0 + ds_fragment[i_s1, i_s2] = 0 if i_s1 < i_s2 else ds_fragment[i_s1, i_s2] T.clear(dk_fragment_2) T.copy(ds_fragment, ds_shared) T.gemm(ds_shared, k_shared, dq_fragment) @@ -387,41 +353,12 @@ def kernel( for i_s, i_k in T.Parallel(block_S, block_DK): dq_fragment[i_s, i_k] = dq_fragment[i_s, i_k] * scale dk_fragment[i_s, i_k] = dk_fragment[i_s, i_k] + dk_fragment_2[i_s, i_k] * scale - T.copy( - dq_fragment, dq[bb, bs * block_S:(bs + 1) * block_S, bh, - bk * block_DK:(bk + 1) * block_DK]) - T.copy( - dk_fragment, dk[bb, bs * block_S:(bs + 1) * block_S, bh, - bk * block_DK:(bk + 1) * block_DK]) + T.copy(dq_fragment, dq[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + T.copy(dk_fragment, dk[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) return kernel -def do_bench(fn, *args, warmup=10, rep=10, **kwargs): - """ - Do benchmark for a function. - """ - start_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - end_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - for _ in range(warmup): - fn(*args, **kwargs) - - torch.cuda.synchronize() - for i in range(rep): - start_event[i].record() - fn(*args, **kwargs) - end_event[i].record() - torch.cuda.synchronize() - - # Record clocks - times = torch.tensor( - [s.elapsed_time(e) for s, e in zip(start_event, end_event)], - dtype=torch.float, - ) - - return times.mean().item() - - def run_test( B, S, @@ -442,33 +379,53 @@ def run_test( threads=256, num_stages=0, ): - Q, K, V, h, G, dO, dh, dv, W = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - dq_ref, dk_ref, dw_ref, dg_ref = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype), block_DK) + Q, K, V, h, G, dO, dh, dv, W = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + dq_ref, dk_ref, dw_ref, dg_ref = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype), block_DK + ) dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = prepare_output( - B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), - getattr(torch, state_dtype), block_DK) + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype), block_DK + ) # ref if use_g: - dq_ref, dk_ref, dw_ref, dg_ref = chunk_bwd_dqkwg( - Q, K, V, G, dO, h, dh, dv, W, chunk_size=chunk_size, scale=scale) + dq_ref, dk_ref, dw_ref, dg_ref = chunk_bwd_dqkwg(Q, K, V, G, dO, h, dh, dv, W, chunk_size=chunk_size, scale=scale) 
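Aside (illustration only, not part of the patch): the hunk above deletes example_chunk_o_bwd.py's hand-rolled `do_bench` helper. Its CUDA-event timing pattern is reproduced below in a compact, self-contained form (as a hypothetical `bench_cuda_events` helper) for readers who still want to time these kernels without the built-in profiler (`kernel.get_profiler().do_bench(...)`) that the gemm examples in this patch switch to.

```python
import torch


def bench_cuda_events(fn, *args, warmup=10, rep=10, **kwargs):
    """Mean latency of fn(*args, **kwargs) in milliseconds, timed with CUDA events."""
    starts = [torch.cuda.Event(enable_timing=True) for _ in range(rep)]
    ends = [torch.cuda.Event(enable_timing=True) for _ in range(rep)]
    for _ in range(warmup):
        fn(*args, **kwargs)
    torch.cuda.synchronize()
    for i in range(rep):
        starts[i].record()
        fn(*args, **kwargs)
        ends[i].record()
    torch.cuda.synchronize()
    times = torch.tensor([s.elapsed_time(e) for s, e in zip(starts, ends)], dtype=torch.float)
    return times.mean().item()
```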
else: - dq_ref, dk_ref, dw_ref, dg_ref = chunk_bwd_dqkwg( - Q, K, V, None, dO, h, dh, dv, W, chunk_size=chunk_size, scale=scale) + dq_ref, dk_ref, dw_ref, dg_ref = chunk_bwd_dqkwg(Q, K, V, None, dO, h, dh, dv, W, chunk_size=chunk_size, scale=scale) # tilelang - kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, state_dtype, chunk_size, scale, use_g, use_dw, - block_DK, block_DV, threads, num_stages) - print(kernel.get_kernel_source()) + kernel = tilelang_chunk_o_bwd_dqkwg( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + scale, + use_g, + use_dw, + block_DK, + block_DV, + threads, + num_stages, + ) dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv, W) if use_g: @@ -515,11 +472,11 @@ def main(): H=8, DK=DK, DV=DV, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", - gate_dtype="float32", - state_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, + gate_dtype=T.float32, + state_dtype=T.float32, chunk_size=64, scale=DK**-0.5, # scale=1, diff --git a/examples/gdn/example_chunk_scaled_dot_kkt.py b/examples/gdn/example_chunk_scaled_dot_kkt.py index d07a4776a..c16374fe8 100644 --- a/examples/gdn/example_chunk_scaled_dot_kkt.py +++ b/examples/gdn/example_chunk_scaled_dot_kkt.py @@ -9,6 +9,7 @@ # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.common.chunk_scaled_dot_kkt import chunk_scaled_dot_kkt_fwd except ImportError: @@ -56,9 +57,9 @@ def tilelang_chunk_scaled_dot_kkt_fwd( H, DK, chunk_size=64, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, use_g=True, # kernel config block_S=64, @@ -75,10 +76,10 @@ def tilelang_chunk_scaled_dot_kkt_fwd( @T.prim_func def kernel( - K: T.Tensor(K_shape, dtype=input_dtype), - Beta: T.Tensor(Beta_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=accum_dtype), - A: T.Tensor(output_shape, dtype=output_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=accum_dtype), + A: T.Tensor(output_shape, dtype=output_dtype), ): with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): bb, bh = bbh // H, bbh % H @@ -93,20 +94,13 @@ def kernel( G_shared = T.alloc_shared((block_S,), dtype=accum_dtype, scope="shared") G_diff_local = T.alloc_fragment((block_S, block_S), dtype=accum_dtype) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - }) - T.fill(A_fragment, 0) T.disable_warp_group_reg_alloc() for i_s in T.Parallel(block_S): Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): - T.copy( - K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - K_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], K_shared) for i_s, i_k2 in T.Parallel(block_S, block_DK): Beta_K_fragment[i_s, i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] T.gemm(Beta_K_fragment, K_shared, A_fragment, transpose_B=True) @@ -117,20 +111,18 @@ def kernel( for i_s1, i_s2 in T.Parallel(block_S, block_S): G_diff_local[i_s1, i_s2] = G_shared[i_s1] - G_shared[i_s2] for i_s1, i_s2 in 
T.Parallel(block_S, block_S): - with T.If(G_diff_local[i_s1, i_s2] <= 0 and i_s1 > i_s2): - with T.Then(): - A_fragment[i_s1, i_s2] = A_fragment[i_s1, i_s2] * T.exp( - G_diff_local[i_s1, i_s2]) - with T.Else(): - A_fragment[i_s1, i_s2] = 0 + A_fragment[i_s1, i_s2] = T.if_then_else( + G_diff_local[i_s1, i_s2] <= 0 and i_s1 > i_s2, + A_fragment[i_s1, i_s2] * T.exp(G_diff_local[i_s1, i_s2]), + 0, + ) else: for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(i_s1 <= i_s2): # noqa: SIM117 - with T.Then(): - A_fragment[i_s1, i_s2] = 0 + if i_s1 <= i_s2: + A_fragment[i_s1, i_s2] = 0 T.copy(A_fragment, A_shared) - T.copy(A_shared, A[bb, bs * block_S:(bs + 1) * block_S, bh, :]) + T.copy(A_shared, A[bb, bs * block_S : (bs + 1) * block_S, bh, :]) return kernel @@ -149,24 +141,21 @@ def run_test( threads, num_stages, ): - K, Beta, G = prepare_input(B, S, H, DK, getattr(torch, input_dtype), - getattr(torch, output_dtype), getattr(torch, accum_dtype)) + K, Beta, G = prepare_input(B, S, H, DK, getattr(torch, input_dtype), getattr(torch, output_dtype), getattr(torch, accum_dtype)) A_ref = prepare_output(B, S, H, chunk_size, getattr(torch, output_dtype)) A_tilelang = prepare_output(B, S, H, chunk_size, getattr(torch, output_dtype)) # reference if use_g: - A_ref = chunk_scaled_dot_kkt_fwd( - K, Beta, G, chunk_size=chunk_size, output_dtype=getattr(torch, output_dtype)) + A_ref = chunk_scaled_dot_kkt_fwd(K, Beta, G, chunk_size=chunk_size, output_dtype=getattr(torch, output_dtype)) else: - A_ref = chunk_scaled_dot_kkt_fwd( - K, Beta, None, chunk_size=chunk_size, output_dtype=getattr(torch, output_dtype)) + A_ref = chunk_scaled_dot_kkt_fwd(K, Beta, None, chunk_size=chunk_size, output_dtype=getattr(torch, output_dtype)) # tilelang block_S = chunk_size - kernel = tilelang_chunk_scaled_dot_kkt_fwd(B, S, H, DK, chunk_size, input_dtype, output_dtype, - accum_dtype, use_g, block_S, block_DK, threads, - num_stages) + kernel = tilelang_chunk_scaled_dot_kkt_fwd( + B, S, H, DK, chunk_size, input_dtype, output_dtype, accum_dtype, use_g, block_S, block_DK, threads, num_stages + ) A_tilelang = kernel(K, Beta, G) try: @@ -186,13 +175,14 @@ def main(): H=32, DK=128, chunk_size=64, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, use_g=True, block_DK=64, threads=128, - num_stages=2) + num_stages=2, + ) if __name__ == "__main__": diff --git a/examples/gdn/example_cumsum.py b/examples/gdn/example_cumsum.py index 9896c7ecf..0760b4964 100644 --- a/examples/gdn/example_cumsum.py +++ b/examples/gdn/example_cumsum.py @@ -10,6 +10,7 @@ # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.utils.cumsum import chunk_local_cumsum_scalar except ImportError: @@ -20,11 +21,8 @@ @tilelang.jit( - out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - }) + out_idx=[-1], pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True} +) def tilelang_chunk_local_cumsum_scalar( # task config B, @@ -34,43 +32,43 @@ def tilelang_chunk_local_cumsum_scalar( is_varlen=False, head_first=False, reverse=False, - input_dtype="float16", - output_dtype="float32", + input_dtype=T.float16, + output_dtype=T.float32, # kernel config block_S=64, threads=256, use_fragment=False, ): G_shape = (B, H, S) if head_first else (B, S, H) - assert chunk_size 
== 2**(chunk_size.bit_length() - 1), "chunk_size must be a power of 2" + assert chunk_size == 2 ** (chunk_size.bit_length() - 1), "chunk_size must be a power of 2" assert chunk_size == block_S, "chunk_size must be equal to block_S" @T.prim_func def kernel( - G: T.Tensor(G_shape, dtype=input_dtype), - G_new: T.Tensor(G_shape, dtype=output_dtype), + G: T.Tensor(G_shape, dtype=input_dtype), + G_new: T.Tensor(G_shape, dtype=output_dtype), ): with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): bb, bh = bbh // H, bbh % H G_shared = T.alloc_shared((1, block_S), dtype=output_dtype, scope="shared") if head_first: - T.copy(G[bb, bh, bs * block_S:(bs + 1) * block_S], G_shared) + T.copy(G[bb, bh, bs * block_S : (bs + 1) * block_S], G_shared) else: - T.copy(G[bb, bs * block_S:(bs + 1) * block_S, bh], G_shared) + T.copy(G[bb, bs * block_S : (bs + 1) * block_S, bh], G_shared) if use_fragment: G_fragment = T.alloc_fragment((1, block_S), dtype=output_dtype, scope="shared") T.copy(G_shared, G_fragment) T.cumsum(G_fragment, dim=1, reverse=reverse) if head_first: - T.copy(G_fragment, G_new[bb, bh, bs * block_S:(bs + 1) * block_S]) + T.copy(G_fragment, G_new[bb, bh, bs * block_S : (bs + 1) * block_S]) else: - T.copy(G_fragment, G_new[bb, bs * block_S:(bs + 1) * block_S, bh]) + T.copy(G_fragment, G_new[bb, bs * block_S : (bs + 1) * block_S, bh]) else: T.cumsum(G_shared, dim=1, reverse=reverse) if head_first: - T.copy(G_shared, G_new[bb, bh, bs * block_S:(bs + 1) * block_S]) + T.copy(G_shared, G_new[bb, bh, bs * block_S : (bs + 1) * block_S]) else: - T.copy(G_shared, G_new[bb, bs * block_S:(bs + 1) * block_S, bh]) + T.copy(G_shared, G_new[bb, bs * block_S : (bs + 1) * block_S, bh]) return kernel @@ -113,11 +111,8 @@ def run_test( # reference cumsum G_new_ref = chunk_local_cumsum_scalar( - g=G, - chunk_size=chunk_size, - reverse=reverse, - head_first=head_first, - output_dtype=getattr(torch, output_dtype)) + g=G, chunk_size=chunk_size, reverse=reverse, head_first=head_first, output_dtype=getattr(torch, output_dtype) + ) # tilelang cumsum block_S = chunk_size @@ -159,10 +154,11 @@ def main(): chunk_size=64, reverse=True, head_first=False, - input_dtype="float32", - output_dtype="float32", + input_dtype=T.float32, + output_dtype=T.float32, threads=256, - use_fragment=False) + use_fragment=False, + ) if __name__ == "__main__": diff --git a/examples/gdn/example_wy_fast.py b/examples/gdn/example_wy_fast.py index 0a0983a82..d36dcf9b7 100644 --- a/examples/gdn/example_wy_fast.py +++ b/examples/gdn/example_wy_fast.py @@ -9,6 +9,7 @@ # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.gated_delta_rule.wy_fast import recompute_w_u_fwd except ImportError: @@ -73,13 +74,13 @@ def tilelang_recompute_w_u_fwd( @T.prim_func def kernel( - K: T.Tensor(K_shape, dtype=input_dtype), - V: T.Tensor(V_shape, dtype=input_dtype), - Beta: T.Tensor(Beta_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - A: T.Tensor(A_shape, dtype=output_dtype), - W: T.Tensor(K_shape, dtype=output_dtype), - U: T.Tensor(V_shape, dtype=output_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + A: T.Tensor(A_shape, dtype=output_dtype), + W: T.Tensor(K_shape, dtype=output_dtype), + U: T.Tensor(V_shape, dtype=output_dtype), ): with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): bb, bh = bbh // H, bbh % H @@ 
-95,49 +96,37 @@ def kernel( W_Beta_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) U_Beta_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - V_shared: tilelang.layout.make_swizzled_layout(V_shared), - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - W_shared: tilelang.layout.make_swizzled_layout(W_shared), - U_shared: tilelang.layout.make_swizzled_layout(U_shared), - W_Beta_shared: tilelang.layout.make_swizzled_layout(W_Beta_shared), - U_Beta_shared: tilelang.layout.make_swizzled_layout(U_Beta_shared), - }) + T.annotate_layout( + { + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + V_shared: tilelang.layout.make_swizzled_layout(V_shared), + } + ) T.disable_warp_group_reg_alloc() for i_s in T.Parallel(block_S): Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] G_shared[i_s] = T.exp(G[bb, bs * block_S + i_s, bh]) - T.copy(A[bb, bs * block_S:(bs + 1) * block_S, bh, :], A_shared) + T.copy(A[bb, bs * block_S : (bs + 1) * block_S, bh, :], A_shared) for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): - T.copy( - V[bb, bs * block_S:(bs + 1) * block_S, bh, i_v * block_DV:(i_v + 1) * block_DV], - V_shared) + T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], V_shared) for i_s, i_v2 in T.Parallel(block_S, block_DV): U_Beta_shared[i_s, i_v2] = V_shared[i_s, i_v2] * Beta_shared[i_s] T.gemm(A_shared, U_Beta_shared, U_fragment, clear_accum=True) # First copy to smem, then copy to gmem to reduce U2RU instructions T.copy(U_fragment, U_shared) - T.copy( - U_shared, U[bb, bs * block_S:(bs + 1) * block_S, bh, - i_v * block_DV:(i_v + 1) * block_DV]) + T.copy(U_shared, U[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV]) for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): - T.copy( - K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - K_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], K_shared) for i_s, i_k2 in T.Parallel(block_S, block_DK): - W_Beta_shared[i_s, - i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] * G_shared[i_s] + W_Beta_shared[i_s, i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] * G_shared[i_s] T.gemm(A_shared, W_Beta_shared, W_fragment, clear_accum=True) # First copy to smem, then copy to gmem to reduce U2RU instructions T.copy(W_fragment, W_shared) - T.copy( - W_shared, W[bb, bs * block_S:(bs + 1) * block_S, bh, - i_k * block_DK:(i_k + 1) * block_DK]) + T.copy(W_shared, W[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK]) return kernel @@ -159,15 +148,8 @@ def run_test( num_stages, ): K, V, Beta, G, A = prepare_input( - B, - S, - H, - DK, - DV, - chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - gate_dtype=getattr(torch, gate_dtype)) + B, S, H, DK, DV, chunk_size, getattr(torch, input_dtype), getattr(torch, output_dtype), gate_dtype=getattr(torch, gate_dtype) + ) W_ref, U_ref = prepare_output(B, S, H, DK, DV, getattr(torch, output_dtype)) W_tilelang, U_tilelang = prepare_output(B, S, H, DK, DV, getattr(torch, output_dtype)) @@ -191,7 +173,8 @@ def run_test( block_DK=block_DK, block_DV=block_DV, threads=threads, - num_stages=num_stages) + num_stages=num_stages, + ) print(kernel.get_kernel_source()) W_tilelang, U_tilelang = kernel(K, V, Beta, G, A) @@ -217,14 +200,15 @@ def main(): DK=128, DV=128, 
chunk_size=64, - input_dtype="bfloat16", - output_dtype="bfloat16", - gate_dtype="float32", - accum_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + gate_dtype=T.float32, + accum_dtype=T.float32, block_DK=64, block_DV=32, threads=128, - num_stages=3) + num_stages=3, + ) if __name__ == "__main__": diff --git a/examples/gdn/example_wy_fast_bwd_split.py b/examples/gdn/example_wy_fast_bwd_split.py index 618a82b4c..822f745f2 100644 --- a/examples/gdn/example_wy_fast_bwd_split.py +++ b/examples/gdn/example_wy_fast_bwd_split.py @@ -10,6 +10,7 @@ # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.gated_delta_rule.wy_fast import bwd_prepare_wy_repr except ImportError: @@ -93,10 +94,8 @@ def prepare_output( @tilelang.jit( out_idx=[-5, -4, -3, -2, -1], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - }) + pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, +) def tilelang_wy_fast_bwd( # task config B, @@ -135,20 +134,20 @@ def tilelang_wy_fast_bwd( @T.prim_func def kernel( - # input - K: T.Tensor(K_shape, dtype=input_dtype), - V: T.Tensor(V_shape, dtype=input_dtype), - Beta: T.Tensor(Beta_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - A: T.Tensor(A_shape, dtype=input_dtype), - dw: T.Tensor(dw_shape, dtype=input_dtype), - du: T.Tensor(du_shape, dtype=input_dtype), - # output - dA: T.Tensor(dA_shape, dtype=input_dtype), - dk: T.Tensor(dk_shape, dtype=output_dtype), - dv: T.Tensor(dv_shape, dtype=output_dtype), - dbeta: T.Tensor(dbeta_shape, dtype=output_dtype), - dg: T.Tensor(dg_shape, dtype=gate_dtype), + # input + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + A: T.Tensor(A_shape, dtype=input_dtype), + dw: T.Tensor(dw_shape, dtype=input_dtype), + du: T.Tensor(du_shape, dtype=input_dtype), + # output + dA: T.Tensor(dA_shape, dtype=input_dtype), + dk: T.Tensor(dk_shape, dtype=output_dtype), + dv: T.Tensor(dv_shape, dtype=output_dtype), + dbeta: T.Tensor(dbeta_shape, dtype=output_dtype), + dg: T.Tensor(dg_shape, dtype=gate_dtype), ): with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): bb, bh = bbh // H, bbh % H @@ -187,7 +186,7 @@ def kernel( T.clear(dbeta_fragment_v) T.clear(dg_fragment) - T.copy(A[bb, bs * block_S:(bs + 1) * block_S, bh, :], A_shared) + T.copy(A[bb, bs * block_S : (bs + 1) * block_S, bh, :], A_shared) for i_s in T.Parallel(block_S): Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] G_shared[i_s] = G[bb, bs * block_S + i_s, bh] @@ -195,51 +194,37 @@ def kernel( # Update dk for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): - T.copy( - K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - K_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], K_shared) for i_s, i_k2 in T.Parallel(block_S, block_DK): - K_shared_beta_g[i_s, - i_k2] = K_shared[i_s, - i_k2] * Beta_shared[i_s] * G_shared_exp[i_s] - T.copy( - dw[bb, bs * block_S:(bs + 1) * block_S, bh, - i_k * block_DK:(i_k + 1) * block_DK], dw_shared) + K_shared_beta_g[i_s, i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] * G_shared_exp[i_s] + T.copy(dw[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], 
dw_shared) T.gemm(dw_shared, K_shared_beta_g, dA_fragment, transpose_B=True) T.gemm(A_shared, dw_shared, dk_fragment_beta_g, clear_accum=True, transpose_A=True) for i_s, i_k2 in T.Parallel(block_S, block_DK): - dk_fragment[ - i_s, - i_k2] = dk_fragment_beta_g[i_s, i_k2] * Beta_shared[i_s] * G_shared_exp[i_s] + dk_fragment[i_s, i_k2] = dk_fragment_beta_g[i_s, i_k2] * Beta_shared[i_s] * G_shared_exp[i_s] # for i_s, i_k2 in T.Parallel(block_S, block_DK): # dbeta_fragment[i_s] = dbeta_fragment[i_s] + dk_fragment_beta_g[i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] for i_s, i_k2 in T.Parallel(block_S, block_DK): - dbeta_fragment_reduce_tmpk[i_s, i_k2] = dk_fragment_beta_g[ - i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] + dbeta_fragment_reduce_tmpk[i_s, i_k2] = dk_fragment_beta_g[i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] T.reduce_sum(dbeta_fragment_reduce_tmpk, dbeta_fragment_k, dim=1, clear=False) # for i_s, i_k2 in T.Parallel(block_S, block_DK): # dg_fragment[i_s] = dg_fragment[i_s] + dk_fragment_beta_g[i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] * Beta_shared[i_s] for i_s, i_k2 in T.Parallel(block_S, block_DK): - dg_fragment_reduce_tmp[i_s, i_k2] = dk_fragment_beta_g[i_s, i_k2] * K_shared[ - i_s, i_k2] * G_shared_exp[i_s] * Beta_shared[i_s] + dg_fragment_reduce_tmp[i_s, i_k2] = ( + dk_fragment_beta_g[i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] * Beta_shared[i_s] + ) T.reduce_sum(dg_fragment_reduce_tmp, dg_fragment, dim=1, clear=False) # correct dk - T.copy( - dk_fragment, dk[bb, bs * block_S:(bs + 1) * block_S, bh, - i_k * block_DK:(i_k + 1) * block_DK]) + T.copy(dk_fragment, dk[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK]) # Update dv for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): - T.copy( - V[bb, bs * block_S:(bs + 1) * block_S, bh, i_v * block_DV:(i_v + 1) * block_DV], - V_shared) + T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], V_shared) for i_s, i_v2 in T.Parallel(block_S, block_DV): V_shared_beta[i_s, i_v2] = V_shared[i_s, i_v2] * Beta_shared[i_s] - T.copy( - du[bb, bs * block_S:(bs + 1) * block_S, bh, - i_v * block_DV:(i_v + 1) * block_DV], du_shared) + T.copy(du[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], du_shared) T.gemm(du_shared, V_shared_beta, dA_fragment, transpose_B=True) T.gemm(A_shared, du_shared, dv_fragment_beta, clear_accum=True, transpose_A=True) for i_s, i_v2 in T.Parallel(block_S, block_DV): @@ -247,30 +232,22 @@ def kernel( # for i_s, i_v2 in T.Parallel(block_S, block_DV): # dbeta_fragment[i_s] = dbeta_fragment[i_s] + dv_fragment_beta[i_s, i_v2] * V_shared[i_s, i_v2] for i_s, i_v2 in T.Parallel(block_S, block_DV): - dbeta_fragment_reduce_tmpv[i_s, - i_v2] = dv_fragment_beta[i_s, i_v2] * V_shared[i_s, - i_v2] + dbeta_fragment_reduce_tmpv[i_s, i_v2] = dv_fragment_beta[i_s, i_v2] * V_shared[i_s, i_v2] T.reduce_sum(dbeta_fragment_reduce_tmpv, dbeta_fragment_v, dim=1, clear=False) - T.copy( - dv_fragment, dv[bb, bs * block_S:(bs + 1) * block_S, bh, - i_v * block_DV:(i_v + 1) * block_DV]) + T.copy(dv_fragment, dv[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV]) # Temporary store dbeta, dg and dA for i_s in T.Parallel(block_S): dbeta[bb, bs * block_S + i_s, bh] = dbeta_fragment_k[i_s] + dbeta_fragment_v[i_s] dg[bb, bs * block_S + i_s, bh] = dg_fragment[i_s] # correct dA - T.copy(dA_fragment, dA[bb, bs * block_S:(bs + 1) * block_S, bh, :]) + 
T.copy(dA_fragment, dA[bb, bs * block_S : (bs + 1) * block_S, bh, :]) return kernel -@tilelang.jit( - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - }) +@tilelang.jit(pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}) def tilelang_wy_fast_bwd_split( # task config B, @@ -308,20 +285,20 @@ def tilelang_wy_fast_bwd_split( @T.prim_func def kernel( - # input - K: T.Tensor(K_shape, dtype=input_dtype), - V: T.Tensor(V_shape, dtype=input_dtype), - Beta: T.Tensor(Beta_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - A: T.Tensor(A_shape, dtype=input_dtype), - dw: T.Tensor(dw_shape, dtype=input_dtype), - du: T.Tensor(du_shape, dtype=input_dtype), - dA: T.Tensor(dA_shape, dtype=input_dtype), - dk: T.Tensor(dk_shape, dtype=output_dtype), - dv: T.Tensor(dv_shape, dtype=output_dtype), - dbeta_k: T.Tensor(dbeta_shape, dtype=output_dtype), - dg_A_positive: T.Tensor(dA_shape, dtype=gate_dtype), - dg_A_negative: T.Tensor(dA_shape, dtype=gate_dtype), + # input + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + A: T.Tensor(A_shape, dtype=input_dtype), + dw: T.Tensor(dw_shape, dtype=input_dtype), + du: T.Tensor(du_shape, dtype=input_dtype), + dA: T.Tensor(dA_shape, dtype=input_dtype), + dk: T.Tensor(dk_shape, dtype=output_dtype), + dv: T.Tensor(dv_shape, dtype=output_dtype), + dbeta_k: T.Tensor(dbeta_shape, dtype=output_dtype), + dg_A_positive: T.Tensor(dA_shape, dtype=gate_dtype), + dg_A_negative: T.Tensor(dA_shape, dtype=gate_dtype), ): with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): bb, bh = bbh // H, bbh % H @@ -350,7 +327,7 @@ def kernel( T.clear(dA_A_fragment_1) T.clear(dA_A_fragment_2) - T.copy(A[bb, bs * block_S:(bs + 1) * block_S, bh, :], A_shared) + T.copy(A[bb, bs * block_S : (bs + 1) * block_S, bh, :], A_shared) for i_s in T.Parallel(block_S): Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] G_shared[i_s] = G[bb, bs * block_S + i_s, bh] @@ -361,34 +338,32 @@ def kernel( # for i_s in T.Parallel(block_S): # dbeta_fragment[i_s] = dbeta[bb, bs * block_S + i_s, bh] # dg_fragment[i_s] = dg[bb, bs * block_S + i_s, bh] - T.copy(dA[bb, bs * block_S:(bs + 1) * block_S, bh, :], dA_shared) + T.copy(dA[bb, bs * block_S : (bs + 1) * block_S, bh, :], dA_shared) # T.copy(dA_shared, dA[bb, bs * block_S:(bs + 1) * block_S, bh, :]) # Update dA T.copy(dA_shared, dA_fragment) for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(i_s1 <= i_s2): # noqa: SIM117 - with T.Then(): - dA_fragment[i_s1, i_s2] = 0 + if i_s1 <= i_s2: + dA_fragment[i_s1, i_s2] = 0 T.copy(dA_fragment, dA_shared) T.gemm(dA_shared, A_shared, dA_fragment, clear_accum=True, transpose_B=True) T.copy(dA_fragment, dA_shared) T.gemm(A_shared, dA_shared, dA_fragment, clear_accum=True, transpose_A=True) for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(i_s1 <= i_s2): - with T.Then(): - dA_fragment[i_s1, i_s2] = 0 - with T.Else(): - dA_fragment[i_s1, i_s2] = -dA_fragment[i_s1, i_s2] + dA_fragment[i_s1, i_s2] = T.if_then_else( + i_s1 <= i_s2, + 0, + -dA_fragment[i_s1, i_s2], + ) for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh] <= 0): - with T.Then(): - dA_fragment[i_s1, i_s2] *= T.exp(G[bb, bs * block_S + i_s1, bh] - - G[bb, bs * block_S + 
i_s2, bh]) - with T.Else(): - dA_fragment[i_s1, i_s2] = 0 + dA_fragment[i_s1, i_s2] = T.if_then_else( + G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh] <= 0, + dA_fragment[i_s1, i_s2] * T.exp(G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh]), + 0, + ) T.copy(dA_fragment, dA_shared) # acceptable dA diff @@ -397,12 +372,8 @@ def kernel( # Update dk using previous dk T.clear(A_fragment) for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): - T.copy( - K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - K_shared) - T.copy( - dk[bb, bs * block_S:(bs + 1) * block_S, bh, - i_k * block_DK:(i_k + 1) * block_DK], dk_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], K_shared) + T.copy(dk[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], dk_shared) T.copy(dk_shared, dk_fragment) for i_s, i_k2 in T.Parallel(block_S, block_DK): K_shared_beta[i_s, i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] @@ -411,18 +382,14 @@ def kernel( # for i_s, i_k2 in T.Parallel(block_S, block_DK): # dbeta_fragment[i_s] = dbeta_fragment[i_s] + dk_fragment_beta[i_s, i_k2] * K_shared[i_s, i_k2] for i_s, i_k2 in T.Parallel(block_S, block_DK): - dbeta_fragment_reduce_tmpk[i_s, - i_k2] = dk_fragment_beta[i_s, i_k2] * K_shared[i_s, - i_k2] + dbeta_fragment_reduce_tmpk[i_s, i_k2] = dk_fragment_beta[i_s, i_k2] * K_shared[i_s, i_k2] T.reduce_sum(dbeta_fragment_reduce_tmpk, dbeta_fragment_k, dim=1, clear=False) T.gemm(dA_shared, K_shared_beta, dk_fragment, transpose_A=True) for i_s, i_k2 in T.Parallel(block_S, block_DK): dk_shared_beta[i_s, i_k2] = dk_fragment_beta[i_s, i_k2] * Beta_shared[i_s] for i_s, i_k2 in T.Parallel(block_S, block_DK): dk_fragment[i_s, i_k2] = dk_fragment[i_s, i_k2] + dk_shared_beta[i_s, i_k2] - T.copy( - dk_fragment, dk[bb, bs * block_S:(bs + 1) * block_S, bh, - i_k * block_DK:(i_k + 1) * block_DK]) + T.copy(dk_fragment, dk[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK]) # Update dg and dbeta T.copy(A_fragment, A_shared) @@ -460,19 +427,25 @@ def run_test( threads=128, num_stages=0, ): - K, V, Beta, G, A, dw, du = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, - accum_dtype), getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - dk_ref, dv_ref, dbeta_ref, dg_ref = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) + K, V, Beta, G, A, dw, du = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + dk_ref, dv_ref, dbeta_ref, dg_ref = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = prepare_output( - B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), - getattr(torch, state_dtype)) + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) BS = chunk_size dA_tilelang = torch.empty(B, S, H, BS, dtype=getattr(torch, input_dtype)).cuda() dbeta_tilelang_k = torch.empty(B, S, H, dtype=getattr(torch, output_dtype)).cuda() @@ -480,28 +453,55 @@ def run_test( 
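Aside (illustrative sketch, not part of the patch): the hunks above replace scoped `with T.If(...) / T.Then() / T.Else()` blocks with either a Python-level `if` or a `T.if_then_else` select expression inside `T.Parallel`. A minimal, hypothetical kernel showing both spellings of the same causal mask, using only constructs that appear in these examples:

```python
import tilelang
import tilelang.language as T


@tilelang.jit(out_idx=[-1])
def causal_mask(S, dtype=T.float32, threads=128):
    @T.prim_func
    def kernel(
        A: T.Tensor((S, S), dtype),
        B: T.Tensor((S, S), dtype),
    ):
        with T.Kernel(1, threads=threads) as bx:  # single block; bx unused
            for i_s1, i_s2 in T.Parallel(S, S):
                # Expression form: keep the lower triangle, zero the rest.
                B[i_s1, i_s2] = T.if_then_else(i_s1 >= i_s2, A[i_s1, i_s2], 0)
                # Statement form with a Python-level `if` would read:
                #     if i_s1 < i_s2:
                #         B[i_s1, i_s2] = 0

    return kernel
```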
dg_tilelang_A_negative = torch.empty(B, S, H, BS, dtype=getattr(torch, gate_dtype)).cuda() # ref - dk_ref, dv_ref, dbeta_ref, dg_ref = bwd_prepare_wy_repr( - K, V, G, Beta, A, dw, du, cu_seqlens=None) + dk_ref, dv_ref, dbeta_ref, dg_ref = bwd_prepare_wy_repr(K, V, G, Beta, A, dw, du, cu_seqlens=None) # tilelang - kernel = tilelang_wy_fast_bwd(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, state_dtype, chunk_size, block_DK, block_DV, threads, - num_stages) - dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = kernel( - K, V, Beta, G, A, dw, du) + kernel = tilelang_wy_fast_bwd( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + block_DK, + block_DV, + threads, + num_stages, + ) + dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = kernel(K, V, Beta, G, A, dw, du) torch.cuda.synchronize() - kernel_split = tilelang_wy_fast_bwd_split(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, chunk_size, - block_DK, block_DV, threads, num_stages) - kernel_split(K, V, Beta, G, A, dw, du, dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang_k, - dg_tilelang_A_positive, dg_tilelang_A_negative) + kernel_split = tilelang_wy_fast_bwd_split( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + block_DK, + block_DV, + threads, + num_stages, + ) + kernel_split( + K, V, Beta, G, A, dw, du, dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang_k, dg_tilelang_A_positive, dg_tilelang_A_negative + ) torch.cuda.synchronize() dbeta_tilelang = dbeta_tilelang_k + dbeta_tilelang - dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum( - dim=-1) + dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum(dim=-1) + + from test_utils import assert_similar - from utils import assert_similar assert_similar(dk_ref, dk_tilelang, eps=1e-5, name="dk", raise_assert=False) assert_similar(dv_ref, dv_tilelang, eps=1e-5, name="dv", raise_assert=False) assert_similar(dbeta_ref, dbeta_tilelang, eps=1e-5, name="dbeta", raise_assert=False) @@ -517,11 +517,11 @@ def main(): H=8, DK=DK, DV=DV, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", - gate_dtype="float32", - state_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, + gate_dtype=T.float32, + state_dtype=T.float32, chunk_size=64, block_DK=32, block_DV=32, diff --git a/examples/gdn/test_example_gdn_compilation.py b/examples/gdn/test_example_gdn_compilation.py index e184dbcac..6f9fa5d2f 100644 --- a/examples/gdn/test_example_gdn_compilation.py +++ b/examples/gdn/test_example_gdn_compilation.py @@ -1,16 +1,16 @@ -import tilelang.testing import torch +from tilelang import language as T B = 1 S = 1024 # small but for test only. 
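Aside (illustrative sketch, not part of the patch): the constants above now carry TileLang dtypes (`T.bfloat16`, `T.float32`) instead of plain strings, while the prepare_* helpers keep resolving the matching torch dtype with `getattr(torch, ...)`. That works only as long as the TileLang dtype behaves like its canonical name string; the hypothetical helper below makes that assumption explicit.

```python
import torch
from tilelang import language as T


def torch_dtype(tl_dtype) -> torch.dtype:
    # Assumes str(T.bfloat16) == "bfloat16", str(T.float32) == "float32", etc.,
    # the same assumption getattr(torch, input_dtype) makes in these tests.
    return getattr(torch, str(tl_dtype))


# e.g. an input tensor matching the kernels' bfloat16 inputs
x = torch.randn(2, 64, 4, 32, dtype=torch_dtype(T.bfloat16), device="cuda")
```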
H = 32 DK = 128 DV = 128 -input_dtype = "bfloat16" -output_dtype = "bfloat16" -accum_dtype = "float32" -gate_dtype = "float32" -state_dtype = "float32" +input_dtype = T.bfloat16 +output_dtype = T.bfloat16 +accum_dtype = T.float32 +gate_dtype = T.float32 +state_dtype = T.float32 chunk_size = 64 use_g = True use_initial_state = True @@ -20,21 +20,15 @@ block_DK = 64 block_DV = 32 threads = 128 -num_stages = 1 +num_stages = 0 def test_example_wy_fast_compilation(): from example_wy_fast import tilelang_recompute_w_u_fwd, prepare_input + K, V, Beta, G, A = prepare_input( - B, - S, - H, - DK, - DV, - chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - gate_dtype=getattr(torch, gate_dtype)) + B, S, H, DK, DV, chunk_size, getattr(torch, input_dtype), getattr(torch, output_dtype), gate_dtype=getattr(torch, gate_dtype) + ) # tilelang block_S = chunk_size kernel = tilelang_recompute_w_u_fwd( @@ -52,22 +46,31 @@ def test_example_wy_fast_compilation(): block_DK=block_DK, block_DV=block_DV, threads=threads, - num_stages=num_stages) + num_stages=num_stages, + ) print(kernel.get_kernel_source()) W_tilelang, U_tilelang = kernel(K, V, Beta, G, A) def test_example_wy_fast_bwd_split_compilation(): from example_wy_fast_bwd_split import tilelang_wy_fast_bwd, tilelang_wy_fast_bwd_split, prepare_input, prepare_output - K, V, Beta, G, A, dw, du = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, - accum_dtype), getattr(torch, gate_dtype), - getattr(torch, state_dtype)) + + K, V, Beta, G, A, dw, du = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = prepare_output( - B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), - getattr(torch, state_dtype)) + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) BS = chunk_size dA_tilelang = torch.empty(B, S, H, BS, dtype=getattr(torch, input_dtype)).cuda() dbeta_tilelang_k = torch.empty(B, S, H, dtype=getattr(torch, output_dtype)).cuda() @@ -75,67 +78,146 @@ def test_example_wy_fast_bwd_split_compilation(): dg_tilelang_A_negative = torch.empty(B, S, H, BS, dtype=getattr(torch, gate_dtype)).cuda() # tilelang - kernel = tilelang_wy_fast_bwd(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, state_dtype, chunk_size, block_DK, block_DV, threads, - num_stages) - dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = kernel( - K, V, Beta, G, A, dw, du) + kernel = tilelang_wy_fast_bwd( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + block_DK, + block_DV, + threads, + num_stages, + ) + dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = kernel(K, V, Beta, G, A, dw, du) torch.cuda.synchronize() - kernel_split = tilelang_wy_fast_bwd_split(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, chunk_size, - block_DK, block_DV, threads, num_stages) - kernel_split(K, V, Beta, G, A, dw, du, dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang_k, - dg_tilelang_A_positive, dg_tilelang_A_negative) + kernel_split = tilelang_wy_fast_bwd_split( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + 
state_dtype, + chunk_size, + block_DK, + block_DV, + threads, + num_stages, + ) + kernel_split( + K, V, Beta, G, A, dw, du, dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang_k, dg_tilelang_A_positive, dg_tilelang_A_negative + ) torch.cuda.synchronize() dbeta_tilelang = dbeta_tilelang_k + dbeta_tilelang - dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum( - dim=-1) + dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum(dim=-1) def test_example_chunk_o_compilation(): from example_chunk_o import tilelang_chunk_fwd_o, prepare_input - Q, K, V, HIDDEN, G = prepare_input(B, S, H, DK, DV, chunk_size, getattr(torch, input_dtype), - getattr(torch, output_dtype), getattr(torch, accum_dtype), - getattr(torch, gate_dtype)) + + Q, K, V, HIDDEN, G = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + ) scale = 1.0 / DK**0.5 block_S = chunk_size - kernel = tilelang_chunk_fwd_o(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, chunk_size, scale, use_g, block_S, block_DK, block_DV, - threads, num_stages) + kernel = tilelang_chunk_fwd_o( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + chunk_size, + scale, + use_g, + block_S, + block_DK, + block_DV, + threads, + num_stages, + ) O_tilelang = kernel(Q, K, V, HIDDEN, G) # noqa: F841 def test_example_chunk_o_bwd_compilation(): from example_chunk_o_bwd import tilelang_chunk_o_bwd_dqkwg, prepare_input - Q, K, V, h, G, dO, dh, dv, W = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, state_dtype, chunk_size, 1.0, use_g, True, - block_DK, block_DV, threads, num_stages) - dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv, - W) # noqa: F841 + + Q, K, V, h, G, dO, dh, dv, W = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + kernel = tilelang_chunk_o_bwd_dqkwg( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + 1.0, + use_g, + True, + block_DK, + block_DV, + threads, + num_stages, + ) + + dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv, W) # noqa: F841 if use_g: dg_tilelang = dg_tilelang.sum(dim=0) def test_example_chunk_scaled_dot_kkt_compilation(): from example_chunk_scaled_dot_kkt import tilelang_chunk_scaled_dot_kkt_fwd, prepare_input - K, Beta, G = prepare_input(B, S, H, DK, getattr(torch, input_dtype), - getattr(torch, output_dtype), getattr(torch, accum_dtype)) + + K, Beta, G = prepare_input(B, S, H, DK, getattr(torch, input_dtype), getattr(torch, output_dtype), getattr(torch, accum_dtype)) block_S = chunk_size - kernel = tilelang_chunk_scaled_dot_kkt_fwd(B, S, H, DK, chunk_size, input_dtype, output_dtype, - accum_dtype, use_g, block_S, block_DK, threads, - num_stages) + kernel = tilelang_chunk_scaled_dot_kkt_fwd( + B, S, H, DK, chunk_size, input_dtype, output_dtype, accum_dtype, use_g, block_S, block_DK, threads, num_stages + ) 
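Aside (illustrative sketch, not part of the patch): every compilation test in this file follows the same calling convention, so one toy example is enough to read them all. A factory decorated with `@tilelang.jit` returns a compiled kernel, and the parameters listed in `out_idx` are allocated by the runtime and handed back, so only the inputs are passed at the call site. The kernel below is hypothetical and assumes N is a multiple of block_N.

```python
import torch
import tilelang
import tilelang.language as T


@tilelang.jit(out_idx=[-1])
def scale_by_two(N, block_N=128, dtype=T.float32):
    @T.prim_func
    def kernel(
        X: T.Tensor((N,), dtype),
        Y: T.Tensor((N,), dtype),
    ):
        with T.Kernel(T.ceildiv(N, block_N), threads=block_N) as bx:
            for i in T.Parallel(block_N):
                Y[bx * block_N + i] = X[bx * block_N + i] * 2.0

    return kernel


kernel = scale_by_two(1024)            # compile once for a fixed problem size
x = torch.randn(1024, device="cuda")
y = kernel(x)                          # Y (out_idx=[-1]) is allocated and returned
```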
A_tilelang = kernel(K, Beta, G) # noqa: F841 def test_example_cumsum_compilation(): from example_cumsum import tilelang_chunk_local_cumsum_scalar, prepare_cumsum_input, prepare_cumsum_output + G = prepare_cumsum_input(B, S, H, getattr(torch, gate_dtype)) G_new_tilelang = prepare_cumsum_output(B, S, H, getattr(torch, gate_dtype)) block_S = chunk_size @@ -157,35 +239,82 @@ def test_example_cumsum_compilation(): def test_example_chunk_delta_h_compilation(): from example_chunk_delta_h import tilelang_chunk_gated_delta_rule_fwd_h, prepare_input - K, W, U, G, initial_state = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype)) - kernel = tilelang_chunk_gated_delta_rule_fwd_h(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, chunk_size, - use_g, use_initial_state, store_final_state, - save_new_value, block_DK, block_DV, threads, - num_stages) - h_tilelang, final_state_tilelang, V_new_tilelang = kernel(K, W, U, G, - initial_state) # noqa: F841 + + K, W, U, G, initial_state = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + ) + kernel = tilelang_chunk_gated_delta_rule_fwd_h( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + use_g, + use_initial_state, + store_final_state, + save_new_value, + block_DK, + block_DV, + threads, + num_stages, + ) + h_tilelang, final_state_tilelang, V_new_tilelang = kernel(K, W, U, G, initial_state) # noqa: F841 def test_example_chunk_delta_bwd_compilation(): from example_chunk_delta_bwd import tilelang_chunk_gated_delta_rule_bwd_dhu, prepare_input - Q, K, W, G, h0, dht, dO, dv = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - kernel = tilelang_chunk_gated_delta_rule_bwd_dhu(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, - chunk_size, 1.0, use_g, use_initial_state, - use_final_state_gradient, block_DV, threads, - num_stages) + + Q, K, W, G, h0, dht, dO, dv = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + kernel = tilelang_chunk_gated_delta_rule_bwd_dhu( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + 1.0, + use_g, + use_initial_state, + use_final_state_gradient, + block_DV, + threads, + num_stages, + ) dh_tilelang, dh0_tilelang, dv2_tilelang = kernel(Q, K, W, G, h0, dht, dO, dv) # noqa: F841 if __name__ == "__main__": - tilelang.testing.main() + # tilelang.testing.main() + test_example_chunk_delta_bwd_compilation() diff --git a/examples/gdn/utils.py b/examples/gdn/test_utils.py similarity index 68% rename from examples/gdn/utils.py rename to examples/gdn/test_utils.py index 37f8d8e69..3588551ce 100644 --- a/examples/gdn/utils.py +++ b/examples/gdn/test_utils.py @@ -9,7 +9,7 @@ def calc_sim(x, y, name="tensor"): x, y = x.data.double(), y.data.double() denominator = (x * x + y * y).sum() if denominator == 0: - print_red_warning(f'{name} all zero') + print_red_warning(f"{name} all zero") return 1 sim = 2 * (x * 
y).sum() / denominator return sim @@ -19,21 +19,19 @@ def assert_similar(x, y, eps=1e-8, name="tensor", data="", raise_assert=True): x_mask = torch.isfinite(x) y_mask = torch.isfinite(y) if not torch.all(x_mask == y_mask): - print_red_warning(f'{name} Error: isfinite mask mismatch') + print_red_warning(f"{name} Error: isfinite mask mismatch") if raise_assert: raise AssertionError - if not torch.isclose( - x.masked_fill(x_mask, 0), y.masked_fill(y_mask, 0), rtol=0, atol=0, - equal_nan=True).all(): - print_red_warning(f'{name} Error: nonfinite value mismatch') + if not torch.isclose(x.masked_fill(x_mask, 0), y.masked_fill(y_mask, 0), rtol=0, atol=0, equal_nan=True).all(): + print_red_warning(f"{name} Error: nonfinite value mismatch") if raise_assert: raise AssertionError x = x.masked_fill(~x_mask, 0) y = y.masked_fill(~y_mask, 0) sim = calc_sim(x, y, name) - diff = 1. - sim + diff = 1.0 - sim if not (0 <= diff <= eps): - print_red_warning(f'{name} Error: {diff}') + print_red_warning(f"{name} Error: {diff}") if raise_assert: raise AssertionError else: diff --git a/examples/gemm/README.md b/examples/gemm/README.md index d7833c97d..9ab7fb661 100644 --- a/examples/gemm/README.md +++ b/examples/gemm/README.md @@ -53,7 +53,7 @@ import tilelang from tilelang import Profiler import tilelang.language as T -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float): @T.prim_func def main( A: T.Tensor((M, K), dtype), @@ -176,7 +176,7 @@ import tilelang.language as T # that helps align data for MMA (Matrix Multiply-Accumulate) operations. from tilelang.intrinsics import make_mma_swizzle_layout as make_swizzle_layout -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float): @T.prim_func def main( A: T.Tensor((M, K), dtype), @@ -265,18 +265,18 @@ def tl_matmul( accum_dtype, ): assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - if out_dtype == "int32": + if out_dtype == T.int32: micro_size_k = 32 # This is a debug config diff --git a/examples/gemm/example_gemm.py b/examples/gemm/example_gemm.py index f18cd388a..dfa431121 100644 --- a/examples/gemm/example_gemm.py +++ b/examples/gemm/example_gemm.py @@ -3,13 +3,12 @@ @tilelang.jit(out_idx=[-1]) -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def gemm( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -58,5 +57,11 @@ def main(): print(f"tilelang Latency: {latency}ms") +def run_regression_perf(): + kernel = matmul(1024, 1024, 1024, 128, 128, 32) + profiler = kernel.get_profiler() + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/gemm/example_gemm_autotune.py b/examples/gemm/example_gemm_autotune.py 
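Aside (not part of the patch): the `assert_similar` helper in the renamed examples/gdn/test_utils.py above scores two tensors with a symmetric cosine-style similarity, sim = 2 * sum(x * y) / sum(x * x + y * y), and requires diff = 1 - sim <= eps. Identical tensors give diff = 0, and for a small perturbation diff grows roughly with the squared relative error, which is why eps=1e-5 is a fairly tight bound. A quick numeric illustration:

```python
import torch


def calc_sim(x, y):
    x, y = x.double(), y.double()
    denom = (x * x + y * y).sum()
    return 1.0 if denom == 0 else (2 * (x * y).sum() / denom).item()


x = torch.randn(1024)
y = x + 1e-3 * torch.randn(1024)      # about 0.1% relative noise
print(1.0 - calc_sim(x, x))           # 0.0 (up to double rounding)
print(1.0 - calc_sim(x, y))           # roughly 5e-7, comfortably below eps=1e-5
```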
index 661ef1276..052bd64c6 100644 --- a/examples/gemm/example_gemm_autotune.py +++ b/examples/gemm/example_gemm_autotune.py @@ -51,9 +51,9 @@ def get_configs(M, N, K, with_roller=False, topk=20): M=M, N=N, K=K, - in_dtype="float16", - out_dtype="float16", - accum_dtype="float", + in_dtype=T.float16, + out_dtype=T.float16, + accum_dtype=T.float32, ).with_arch(arch) func = carve_template.equivalent_function() @@ -90,7 +90,8 @@ def get_configs(M, N, K, with_roller=False, topk=20): num_stages, thread_num, enable_rasterization, - )) + ) + ) configs = [ { @@ -100,13 +101,19 @@ def get_configs(M, N, K, with_roller=False, topk=20): "num_stages": c[3], "thread_num": c[4], "enable_rasteration": c[5], # keep param name for backward-compat - } for c in _configs + } + for c in _configs ] return configs -def get_best_config(M, N, K, with_roller=False): - +def get_best_config( + M, + N, + K, + with_roller: bool = False, + profile_backend: str = "event", +): def kernel( block_M=None, block_N=None, @@ -115,17 +122,16 @@ def kernel( thread_num=None, enable_rasteration=None, ): - dtype = "bfloat16" - accum_dtype = "float" + dtype = T.bfloat16 + accum_dtype = T.float32 @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) B_shared = T.alloc_shared((block_N, block_K), dtype) C_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -146,15 +152,19 @@ def main( return main - autotuner = AutoTuner.from_kernel( - kernel=kernel, configs=get_configs(M, N, K, with_roller)).set_compile_args( + autotuner = ( + AutoTuner.from_kernel(kernel=kernel, configs=get_configs(M, N, K, with_roller)) + .set_compile_args( out_idx=[-1], target="auto", - ).set_profile_args( + ) + .set_profile_args( supply_type=tl.TensorSupplyType.Integer, ref_prog=ref_program, skip_check=False, + backend=profile_backend, ) + ) return autotuner.run(warmup=3, rep=20) @@ -167,52 +177,20 @@ def get_heuristic_config() -> dict: sm_version = sm_major * 10 + sm_minor print(f"CUDA device capability: {sm_version}") if sm_version in {80}: - return { - "block_M": 128, - "block_N": 256, - "block_K": 32, - "num_stages": 2, - "thread_num": 128, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 32, "num_stages": 2, "thread_num": 128, "enable_rasteration": True} elif sm_version in {90}: - return { - "block_M": 128, - "block_N": 256, - "block_K": 64, - "num_stages": 3, - "thread_num": 256, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 64, "num_stages": 3, "thread_num": 256, "enable_rasteration": True} else: - return { - "block_M": 128, - "block_N": 256, - "block_K": 32, - "num_stages": 0, - "thread_num": 128, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 32, "num_stages": 0, "thread_num": 128, "enable_rasteration": True} @tl.jit(out_idx=[-1]) -def matmul(M, - N, - K, - block_M, - block_N, - block_K, - num_stages, - thread_num, - enable_rasteration, - dtype="float16", - accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, num_stages, thread_num, enable_rasteration, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def gemm_autotune( - 
A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -236,14 +214,22 @@ def gemm_autotune( return gemm_autotune -def main(M: int = 4096, - N: int = 4096, - K: int = 4096, - use_autotune: bool = False, - with_roller: bool = False): - use_autotune = True +def main( + M: int = 4096, + N: int = 4096, + K: int = 4096, + use_autotune: bool = False, + with_roller: bool = False, + profile_backend: str = "event", +): if use_autotune: - result = get_best_config(M, N, K, with_roller) + result = get_best_config( + M, + N, + K, + with_roller=with_roller, + profile_backend=profile_backend, + ) print(result.config) kernel = result.kernel else: @@ -252,8 +238,13 @@ def main(M: int = 4096, # benchmark profiler = kernel.get_profiler(tensor_supply_type=tl.TensorSupplyType.Auto) - tilelang_latency = profiler.do_bench() - ref_latency = profiler.do_bench(ref_program) + tilelang_latency = profiler.do_bench( + backend=profile_backend, + ) + ref_latency = profiler.do_bench( + ref_program, + backend=profile_backend, + ) profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2) print(f"TileLang latency: {tilelang_latency}") print(f"Ref latency: {ref_latency}") @@ -261,20 +252,27 @@ def main(M: int = 4096, print(f"Ref TFlops: {2 * M * N * K / ref_latency * 1e-9}") +def run_regression_perf(M: int = 4096, N: int = 4096, K: int = 4096): + config = get_heuristic_config() + kernel = matmul(M, N, K, **config) + profiler = kernel.get_profiler(tensor_supply_type=tl.TensorSupplyType.Auto) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark") parser.add_argument("--m", type=int, default=4096, help="Matrix dimension M") parser.add_argument("--n", type=int, default=4096, help="Matrix dimension N") parser.add_argument("--k", type=int, default=4096, help="Matrix dimension K") - parser.add_argument( - "--use_autotune", - action="store_true", - default=False, - help="Whether to use autotune for matmul configs") - parser.add_argument( - "--with_roller", - action="store_true", - default=False, - help="Whether to enable BitBLAS roller for search space") + parser.add_argument("--use_autotune", action="store_true", default=False, help="Whether to use autotune for matmul configs") + parser.add_argument("--with_roller", action="store_true", default=False, help="Whether to enable BitBLAS roller for search space") + parser.add_argument("--profile_backend", type=str, default="event", help="Profiler backend") args = parser.parse_args() - main(args.m, args.n, args.k, args.use_autotune, args.with_roller) + main( + args.m, + args.n, + args.k, + args.use_autotune, + args.with_roller, + args.profile_backend, + ) diff --git a/examples/gemm/example_gemm_intrinsics.py b/examples/gemm/example_gemm_intrinsics.py index 5c014ce3a..15e552587 100644 --- a/examples/gemm/example_gemm_intrinsics.py +++ b/examples/gemm/example_gemm_intrinsics.py @@ -4,8 +4,8 @@ import tilelang.language as T from tilelang.intrinsics import get_swizzle_layout from tilelang.intrinsics.mma_macro_generator import ( - TensorCoreIntrinEmitter,) -from tilelang.transform import simplify_prim_func + TensorCoreIntrinEmitter, +) def make_swizzle_layout(shared_buf): @@ -24,7 +24,6 @@ def transform_func(i, j): 
@tilelang.jit(out_idx=[2]) -@simplify_prim_func def tl_matmul( M, N, @@ -34,18 +33,18 @@ def tl_matmul( accum_dtype, ): assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - if out_dtype == "int32": + if out_dtype == T.int32: micro_size_k = 32 # This is a debug config @@ -53,7 +52,7 @@ def tl_matmul( block_col_warps = 2 warp_row_tiles = 64 warp_col_tiles = 64 - # chunk = 32 if in_dtype == "float16" else 64 + # chunk = 32 if in_dtype == T.float16 else 64 chunk = 32 shared_scope = "shared.dyn" @@ -99,12 +98,11 @@ def tl_matmul( @T.prim_func def gemm_intrinsics( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) @@ -112,10 +110,12 @@ def gemm_intrinsics( B_local = T.alloc_local((warp_cols * local_size_b), in_dtype) C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype) - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - B_shared: make_swizzle_layout(B_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + B_shared: make_swizzle_layout(B_shared), + } + ) # Improve L2 Cache T.use_swizzle(panel_size=10) @@ -123,7 +123,6 @@ def gemm_intrinsics( T.clear(C_local) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, block_K): A_shared[i, k] = A[by * block_M + i, ko * block_K + k] @@ -133,7 +132,6 @@ def gemm_intrinsics( B_shared[j, k] = B[bx * block_N + j, ko * block_K + k] for ki in T.serial(0, (block_K // micro_size_k)): - # Load A into fragment mma_emitter.ldmatrix_a(A_local, A_shared, ki) @@ -163,7 +161,7 @@ def ref_program(A, B): def main(M=4096, N=4096, K=4096): - in_dtype, out_dtype, accum_dtype = "float16", "float16", "float32" + in_dtype, out_dtype, accum_dtype = T.float16, T.float16, T.float32 kernel = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype) src_code = kernel.get_kernel_source() # src_code is the generated cuda source @@ -181,5 +179,12 @@ def main(M=4096, N=4096, K=4096): profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2) +def run_regression_perf(M=4096, N=4096, K=4096): + in_dtype, out_dtype, accum_dtype = "float16", "float16", "float32" + kernel = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype) + profiler = kernel.get_profiler() + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": main(M=4096, N=4096, K=4096) diff --git a/examples/gemm/example_gemm_persistent.py b/examples/gemm/example_gemm_persistent.py index a2a7122d3..ad3d556ed 100644 --- a/examples/gemm/example_gemm_persistent.py +++ b/examples/gemm/example_gemm_persistent.py @@ -5,22 +5,12 @@ @tilelang.jit(out_idx=[-1]) -def matmul_non_persistent(M, - N, - K, - block_M, - block_N, - block_K, - threads, - num_stages, - dtype="float16", - accum_dtype="float"): - +def matmul_non_persistent(M, N, K, block_M, block_N, block_K, threads, 
num_stages, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=threads) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -43,18 +33,9 @@ def main( @tilelang.jit(out_idx=[-1]) -def matmul_persistent(M, - N, - K, - block_M, - block_N, - block_K, - threads, - num_stages, - dtype="float16", - accum_dtype="float", - use_persistent_primitive=True): - +def matmul_persistent( + M, N, K, block_M, block_N, block_K, threads, num_stages, dtype=T.float16, accum_dtype=T.float32, use_persistent_primitive=True +): sm_num = driver.get_num_sms() m_blocks = T.ceildiv(M, block_M) n_blocks = T.ceildiv(N, block_N) @@ -63,9 +44,9 @@ def matmul_persistent(M, @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(sm_num, threads=threads) as (block_id): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -90,9 +71,9 @@ def main( @T.prim_func def main_persistent_primitive( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(sm_num, threads=threads) as (block_id): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -100,8 +81,7 @@ def main_persistent_primitive( C_local = T.alloc_fragment((block_M, block_N), accum_dtype) C_shared = T.alloc_shared((block_M, block_N), dtype) - for bx, by in T.Persistent( - [T.ceildiv(M, block_M), T.ceildiv(N, block_N)], sm_num, block_id): + for bx, by in T.Persistent([T.ceildiv(M, block_M), T.ceildiv(N, block_N)], sm_num, block_id): T.clear(C_local) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(A[bx * block_M, k * block_K], A_shared) @@ -128,18 +108,15 @@ def main(M=4096, N=4096, K=4096): num_stages = 3 persistent_kernel = matmul_persistent(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, threads, num_stages) - persistent_profiler = persistent_kernel.get_profiler( - tensor_supply_type=tilelang.TensorSupplyType.Randn) + persistent_profiler = persistent_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) persistent_profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01) print("Persistent GEMM: All check passed.") persistent_latency = persistent_profiler.do_bench(warmup=500) print(f"Persistent GEMM Latency: {persistent_latency} ms") print(f"Persistent GEMM TFlops: {total_flops / persistent_latency * 1e-9} TFlops") - non_persistent_kernel = matmul_non_persistent(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, threads, - num_stages) - non_persistent_profiler = non_persistent_kernel.get_profiler( - tensor_supply_type=tilelang.TensorSupplyType.Randn) + non_persistent_kernel = matmul_non_persistent(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, threads, num_stages) + non_persistent_profiler = non_persistent_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) non_persistent_profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01) print("Non-Persistent GEMM: All check passed.") non_persistent_latency = non_persistent_profiler.do_bench(warmup=500) @@ -149,11 +126,22 @@ def main(M=4096, N=4096, K=4096): print(f"Persistent GEMM Speedup: {non_persistent_latency / 
persistent_latency}") +def run_regression_perf(M=4096, N=4096, K=4096): + BLOCK_M = 128 + BLOCK_N = 256 + BLOCK_K = 64 + threads = 256 + num_stages = 3 + persistent_kernel = matmul_persistent(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, threads, num_stages) + persistent_profiler = persistent_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) + return persistent_profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--M', type=int, default=8192, help='M dimension') - parser.add_argument('--N', type=int, default=8192, help='N dimension') - parser.add_argument('--K', type=int, default=8192, help='K dimension') + parser.add_argument("--M", type=int, default=8192, help="M dimension") + parser.add_argument("--N", type=int, default=8192, help="N dimension") + parser.add_argument("--K", type=int, default=8192, help="K dimension") args = parser.parse_args() M, N, K = args.M, args.N, args.K main(M, N, K) diff --git a/examples/gemm/example_gemm_schedule.py b/examples/gemm/example_gemm_schedule.py index f4727412b..17dbcc568 100644 --- a/examples/gemm/example_gemm_schedule.py +++ b/examples/gemm/example_gemm_schedule.py @@ -3,13 +3,12 @@ @tilelang.jit(out_idx=[-1]) -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def gemm_schedule( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -65,5 +64,19 @@ def main(): print(kernel.get_kernel_source()) +def run_regression_perf(): + kernel = matmul(1024, 1024, 1024, 128, 128, 32) + import torch + + a = torch.randn(1024, 1024).cuda().half() + b = torch.randn(1024, 1024).cuda().half() + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(a, b) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/gemm/regression_example_gemm.py b/examples/gemm/regression_example_gemm.py new file mode 100644 index 000000000..3583cf16a --- /dev/null +++ b/examples/gemm/regression_example_gemm.py @@ -0,0 +1,25 @@ +import tilelang.testing +import example_gemm +import example_gemm_autotune +import example_gemm_intrinsics +import example_gemm_schedule + + +def regression_example_gemm_autotune(): + tilelang.testing.process_func(example_gemm_autotune.run_regression_perf, M=1024, N=1024, K=1024) + + +def regression_example_gemm_intrinsics(): + tilelang.testing.process_func(example_gemm_intrinsics.run_regression_perf, M=1024, N=1024, K=1024) + + +def regression_example_gemm_schedule(): + tilelang.testing.process_func(example_gemm_schedule.run_regression_perf) + + +def regression_example_gemm(): + tilelang.testing.process_func(example_gemm.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/gemm_fp8/README.md b/examples/gemm_fp8/README.md index 9d7011a06..2b3dc9560 100644 --- a/examples/gemm_fp8/README.md +++ b/examples/gemm_fp8/README.md @@ -1 +1 @@ -**Notes**: Now we only support fp8 with mma instructions instead of `T.gemm`, because the cutlass version of tilelang is too old, we should update the cutlass version in future. 
\ No newline at end of file +**Notes**: Now we only support fp8 with mma instructions instead of `T.gemm`, because the cutlass version of tilelang is too old, we should update the cutlass version in future. diff --git a/examples/gemm_fp8/example_tilelang_gemm_amd.py b/examples/gemm_fp8/example_tilelang_gemm_amd.py index 0e6ace757..16a9d5f32 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_amd.py +++ b/examples/gemm_fp8/example_tilelang_gemm_amd.py @@ -2,6 +2,7 @@ import tilelang import tilelang.language as T from tilelang.utils.tensor import torch_assert_close +from tilelang.utils import determine_fp8_type, determine_torch_fp8_type import itertools @@ -17,10 +18,9 @@ def supply_prog(args): a_param, b_param = args M, K = a_param.shape N, _ = b_param.shape - a = (torch.randn(M, K, dtype=torch.float16, device='cuda') * - 0.01).to(dtype=torch.float8_e4m3fnuz) - b = (torch.randn(N, K, dtype=torch.float16, device='cuda') * - 0.01).to(dtype=torch.float8_e4m3fnuz) + fp8_dtype = determine_torch_fp8_type() + a = (torch.randn(M, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=fp8_dtype) + b = (torch.randn(N, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=fp8_dtype) return [a, b] @@ -35,40 +35,36 @@ def get_configs(): valid_configs = [] - for m, n, k, stages, t, kp, gemm_type in itertools.product(block_Ms, block_Ns, block_Ks, - num_stages, num_threads, k_packs, - gemm_types): - valid_configs.append({ - "block_M": m, - "block_N": n, - "block_K": k, - "num_stages": stages, - "num_threads": t, - "k_pack": kp, - "gemm_type": gemm_type, - }) + for m, n, k, stages, t, kp, gemm_type in itertools.product(block_Ms, block_Ns, block_Ks, num_stages, num_threads, k_packs, gemm_types): + valid_configs.append( + { + "block_M": m, + "block_N": n, + "block_K": k, + "num_stages": stages, + "num_threads": t, + "k_pack": kp, + "gemm_type": gemm_type, + } + ) return valid_configs @tilelang.autotune( - configs=get_configs(), - cache_input_tensors=True, - ref_prog=ref_program, - manual_check_prog=manual_check_prog, - supply_prog=supply_prog) + configs=get_configs(), cache_input_tensors=True, ref_prog=ref_program, manual_check_prog=manual_check_prog, supply_prog=supply_prog +) @tilelang.jit(out_idx=[-1]) def fp8_matmul(M, N, K, block_M, block_N, block_K, num_stages, num_threads, k_pack, gemm_type): - dtype = "float8_e4m3fnuz" - accum_dtype = "float" + dtype = determine_fp8_type() + accum_dtype = T.float32 @T.prim_func def gemm_fp8_rs( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), accum_dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), accum_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): A_local = T.alloc_fragment((block_M, block_K), dtype) B_shared = T.alloc_shared((block_N, block_K), dtype) C_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -77,24 +73,17 @@ def gemm_fp8_rs( for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(A[by * block_M, k * block_K], A_local) T.copy(B[bx * block_N, k * block_K], B_shared) - T.gemm( - A_local, - B_shared, - C_local, - transpose_B=True, - k_pack=k_pack, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(A_local, B_shared, C_local, transpose_B=True, k_pack=k_pack, policy=T.GemmWarpPolicy.FullRow) T.copy(C_local, C[by * block_M, bx * block_N]) @T.prim_func def gemm_fp8_ss( - A: T.Tensor((M, K), dtype), - B: 
T.Tensor((N, K), dtype), - C: T.Tensor((M, N), accum_dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), accum_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) B_shared = T.alloc_shared((block_N, block_K), dtype) C_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -103,13 +92,7 @@ def gemm_fp8_ss( for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(A[by * block_M, k * block_K], A_shared) T.copy(B[bx * block_N, k * block_K], B_shared) - T.gemm( - A_shared, - B_shared, - C_local, - transpose_B=True, - k_pack=k_pack, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(A_shared, B_shared, C_local, transpose_B=True, k_pack=k_pack, policy=T.GemmWarpPolicy.FullRow) T.copy(C_local, C[by * block_M, bx * block_N]) @@ -123,10 +106,9 @@ def gemm_fp8_ss( def test_gemm_fp8(M, N, K): kernel = fp8_matmul(M, N, K) - a = (torch.randn(M, K, dtype=torch.float16, device='cuda') * - 0.01).to(dtype=torch.float8_e4m3fnuz) - b = (torch.randn(N, K, dtype=torch.float16, device='cuda') * - 0.01).to(dtype=torch.float8_e4m3fnuz) + fp8_dtype = determine_torch_fp8_type() + a = (torch.randn(M, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=fp8_dtype) + b = (torch.randn(N, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=fp8_dtype) c = kernel(a, b) ref_c = ref_program(a, b) torch_assert_close(c, ref_c, rtol=1e-2, atol=1e-2) diff --git a/examples/gemm_fp8/example_tilelang_gemm_amd_fp8_preshuffle.py b/examples/gemm_fp8/example_tilelang_gemm_amd_fp8_preshuffle.py new file mode 100644 index 000000000..fc7fb4400 --- /dev/null +++ b/examples/gemm_fp8/example_tilelang_gemm_amd_fp8_preshuffle.py @@ -0,0 +1,225 @@ +import torch +import itertools +import tilelang +import tilelang.testing +from tilelang import tvm as tvm +import tilelang.language as T +from tilelang.tileop.base import GemmWarpPolicy +from tilelang.layout import make_swizzled_layout +from tilelang.intrinsics.mfma_macro_generator import MatrixCorePreshuffleIntrinEmitter +from tilelang.utils import determine_fp8_type + +tilelang.testing.set_random_seed(0) + + +def get_configs(): + block_Ms = [32, 64, 128] + block_Ns = [32, 64, 128] + block_Ks = [64, 128] + num_stages = [0, 1, 2] + + valid_configs = [] + + for m, n, k, stages in itertools.product(block_Ms, block_Ns, block_Ks, num_stages): + valid_configs.append( + { + "block_M": m, + "block_N": n, + "block_K": k, + "num_stages": stages, + } + ) + return valid_configs + + +@tilelang.autotune( + configs=get_configs(), +) +@tilelang.jit(out_idx=[-1]) +def tl_matmul( + M, + N, + K, + block_M, + block_N, + block_K, + num_stages, + k_pack=2, + num_threads=256, + in_dtype=None, + out_dtype=T.float32, + accum_dtype=T.float32, + a_transposed=False, + b_transposed=True, +): + if in_dtype is None: + in_dtype = determine_fp8_type() + b_preshuffle = True + warp_size = 64 + num_warps = num_threads // warp_size + + policy = GemmWarpPolicy.Square + m_warp, n_warp = policy.compute_warp_partition(block_M, block_N, num_warps) + + shared_scope = "shared" + warp_row_tiles = block_M // m_warp + warp_col_tiles = block_N // n_warp + + # MMA Wrapper to Auto Generate Code for MMA + mfma_emitter = MatrixCorePreshuffleIntrinEmitter( + a_dtype=in_dtype, + b_dtype=in_dtype, + accum_dtype=accum_dtype, + a_transposed=a_transposed, + b_transposed=b_transposed, + 
block_row_warps=m_warp, + block_col_warps=n_warp, + warp_row_tiles=warp_row_tiles, + warp_col_tiles=warp_col_tiles, + chunk=block_K, + k_pack=k_pack, + b_preshuffle=b_preshuffle, + ) + local_size_a = mfma_emitter.local_size_a + local_size_b = mfma_emitter.local_size_b + + warp_rows = mfma_emitter.warp_rows + warp_cols = mfma_emitter.warp_cols + + micro_size_y = mfma_emitter.micro_size_y + micro_size_k = mfma_emitter.micro_size_k + pack_size_k = micro_size_k * k_pack + + A_shape = (K, M) if a_transposed else (M, K) + A_shared_shape = (block_K, block_M) if a_transposed else (block_M, block_K) + + B_shape = ( + (N // micro_size_y, K // pack_size_k, micro_size_y, pack_size_k) + if b_transposed + else (K // pack_size_k, N // micro_size_y, pack_size_k, micro_size_y) + ) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) + A_local = T.alloc_local((warp_rows * local_size_a * k_pack), in_dtype) + B_local = T.alloc_local((warp_cols * local_size_b * k_pack), in_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + + T.annotate_layout( + { + A_shared: make_swizzled_layout(A_shared), + C_local: mfma_emitter.make_mfma_store_layout(C_local), + } + ) + + num_ko = K // block_K + num_ki = block_K // (k_pack * micro_size_k) + + # Improve L2 Cache + # T.use_swizzle(panel_size=10) + T.clear(C_local) + for ko in T.Pipelined(num_ko, num_stages=num_stages): + # Load A into shared memory + if a_transposed: + T.copy(A[ko * block_K, by * block_M], A_shared) + else: + T.copy(A[by * block_M, ko * block_K], A_shared) + + for ki in T.serial(0, num_ki): + mfma_emitter.ldmatrix_a( + A_local, + A_shared, + ki, + ) + mfma_emitter.ldmatrix_b(B_local, B, ki + ko * num_ki, pid_m=by, pid_n=bx) + + # Perform Matrix Multiplication + mfma_emitter.mfma(A_local, B_local, C_local, ki) + + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def shuffle_weight( + x: torch.Tensor, + layout=(16, 32), + k_pack=1, + is_transpose=False, +) -> torch.Tensor: + IN, IK = layout + BK = IK * k_pack + BN = IN + + N, K = (x.shape[-2], x.shape[-1]) if is_transpose else (x.shape[-1], x.shape[-2]) + assert N % BN == 0 + assert K % BK == 0 + + x = x.view(N // BN, BN, K // BK, BK) if is_transpose else x.view(K // BK, BK, N // BN, BN) + x = x.permute(0, 2, 1, 3) + return x.contiguous() + + +def assert_tl_matmul_correctness(M, N, K, k_pack=1, a_transposed=False, b_transposed=True): + in_dtype = determine_fp8_type() + out_dtype = T.float32 + accum_dtype = T.float32 + kernel = tl_matmul( + M, + N, + K, + k_pack=k_pack, + in_dtype=in_dtype, + out_dtype=out_dtype, + accum_dtype=accum_dtype, + a_transposed=a_transposed, + b_transposed=b_transposed, + ) + + src_code = kernel.get_kernel_source() + # src_code is the generated cuda source + assert src_code is not None + A_shape = (K, M) if a_transposed else (M, K) + B_shape = (N, K) if b_transposed else (K, N) + + A = (torch.rand(A_shape, device="cuda", dtype=torch.float16) / 10).to(getattr(torch, in_dtype)) + B = (torch.rand(B_shape, device="cuda", dtype=torch.float16) / 10).to(getattr(torch, in_dtype)) + + B_preshuffle = shuffle_weight(B, k_pack=k_pack, is_transpose=b_transposed) + C = kernel(A, B_preshuffle) + + profiler = kernel.get_profiler() + latency = profiler.do_bench() + + # Ensure that the latency is not None + assert 
latency is not None + print("time: ", latency) + + if a_transposed and b_transposed: + # Get Reference Result + ref_c = torch.matmul(A.T.half(), B.T.half()).to(getattr(torch, out_dtype)) + elif a_transposed and not b_transposed: + # Get Reference Result + ref_c = torch.matmul(A.T.half(), B.half()).to(getattr(torch, out_dtype)) + elif not a_transposed and b_transposed: + # Get Reference Result + ref_c = torch.matmul(A.half(), B.T.half()).to(getattr(torch, out_dtype)) + else: + # Get Reference Result + ref_c = torch.matmul(A.half(), B.half()).to(getattr(torch, out_dtype)) + + torch.testing.assert_close(C, ref_c, rtol=1e-2, atol=1e-2) + + +def test_assert_tl_matmul(): + assert_tl_matmul_correctness(512, 512, 512, k_pack=2) + + +if __name__ == "__main__": + test_assert_tl_matmul() diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8.py b/examples/gemm_fp8/example_tilelang_gemm_fp8.py index a403ed068..3b575c78e 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8.py +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8.py @@ -1,7 +1,7 @@ import torch import tilelang import tilelang.language as T -from tilelang.utils.tensor import map_torch_type +from tilelang.utils import determine_fp8_type def calc_diff(x, y): @@ -12,13 +12,12 @@ def calc_diff(x, y): @tilelang.jit(out_idx=[-1]) -def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype=T.float32): @T.prim_func def gemm_fp8( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -37,12 +36,12 @@ def gemm_fp8( def test_gemm_fp8(M, N, K, dtype): - torch_dtype = map_torch_type(dtype) + torch_dtype = T.dtype(dtype).as_torch() kernel = matmul(M, N, K, 128, 128, 64, dtype) - a = torch.randn(M, K, dtype=torch.float16, device='cuda').to(dtype=torch_dtype) - b = torch.randn(N, K, dtype=torch.float16, device='cuda').to(dtype=torch_dtype) + a = torch.randn(M, K, dtype=torch.float16, device="cuda").to(dtype=torch_dtype) + b = torch.randn(N, K, dtype=torch.float16, device="cuda").to(dtype=torch_dtype) c = kernel(a, b) @@ -57,8 +56,24 @@ def test_gemm_fp8(M, N, K, dtype): def main(): - test_gemm_fp8(1024, 1024, 1024, 'float8_e4m3') - test_gemm_fp8(1024, 1024, 1024, 'float8_e5m2') + test_gemm_fp8(1024, 1024, 1024, determine_fp8_type()) + test_gemm_fp8(1024, 1024, 1024, determine_fp8_type("e5m2")) + + +def run_regression_perf(): + M, N, K = 4096, 4096, 4096 + dtype = determine_fp8_type() + kernel_e4m3 = matmul(M, N, K, 128, 128, 64, dtype) + profiler_e4m3 = kernel_e4m3.get_profiler(tilelang.TensorSupplyType.Integer) + if torch.version.hip is None: + latency_e4m3 = profiler_e4m3.do_bench(backend="cupti") + dtype = determine_fp8_type("e5m2") + kernel_e5m2 = matmul(M, N, K, 128, 128, 64, dtype) + profiler_e5m2 = kernel_e5m2.get_profiler(tilelang.TensorSupplyType.Integer) + latency_e5m2 = profiler_e5m2.do_bench(backend="cupti") + return (latency_e4m3 + latency_e5m2) / 2 + latency_e4m3 = profiler_e4m3.do_bench() + return latency_e4m3 if __name__ == "__main__": diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py index 1d9207aff..39c6fc333 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py +++ 
b/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py @@ -1,11 +1,11 @@ import torch import tilelang import tilelang.language as T -from tilelang.utils.tensor import map_torch_type +from tilelang.utils import determine_fp8_type @tilelang.jit(out_idx=[-1]) -def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype="float"): +def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype=T.float32): # for fp8 gemm, do one promote after 4 wgmma inst, i.e. block_K = 128. # if block_K < 128, promote after 128/block_K iters. # if block_K > 128, promote after every iter. @@ -13,9 +13,9 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype="float"): @T.prim_func def gemm_fp8_2xAcc( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), accum_dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), accum_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -55,18 +55,18 @@ def calc_diff(x, y): def test_gemm_fp8(M, N, K, dtype): - torch_dtype = map_torch_type(dtype) + torch_dtype = T.dtype(dtype).as_torch() kernel = matmul(M, N, K, 128, 128, 64, dtype) - a = torch.rand(M, K, dtype=torch.float16, device='cuda') + a = torch.rand(M, K, dtype=torch.float16, device="cuda") a = (100 * (2 * a - 1)).to(dtype=torch_dtype) - b = torch.rand(N, K, dtype=torch.float16, device='cuda') + b = torch.rand(N, K, dtype=torch.float16, device="cuda") b = (100 * (2 * b - 1)).to(dtype=torch_dtype) c = kernel(a, b) - ref_c = (a.float() @ b.float().T) + ref_c = a.float() @ b.float().T diff = calc_diff(c, ref_c) print(f"diff: {diff}") @@ -74,8 +74,26 @@ def test_gemm_fp8(M, N, K, dtype): def main(): - test_gemm_fp8(1024, 1024, 8192, 'float8_e4m3') - test_gemm_fp8(1024, 1024, 8192, 'float8_e5m2') + test_gemm_fp8(1024, 1024, 8192, determine_fp8_type()) + test_gemm_fp8(1024, 1024, 8192, determine_fp8_type("e5m2")) + + +def run_regression_perf(): + M, N, K = 1024, 1024, 8192 + dtype = determine_fp8_type() + kernel_e4m3 = matmul(M, N, K, 128, 128, 64, dtype) + profiler_e4m3 = kernel_e4m3.get_profiler(tilelang.TensorSupplyType.Integer) + if torch.version.hip is None: + latency_e4m3 = profiler_e4m3.do_bench(backend="cupti") + else: + latency_e4m3 = profiler_e4m3.do_bench() + if torch.version.hip is None: + dtype = determine_fp8_type("e5m2") + kernel_e5m2 = matmul(M, N, K, 128, 128, 64, dtype) + profiler_e5m2 = kernel_e5m2.get_profiler(tilelang.TensorSupplyType.Integer) + latency_e5m2 = profiler_e5m2.do_bench(backend="cupti") + return (latency_e4m3 + latency_e5m2) / 2 + return latency_e4m3 if __name__ == "__main__": diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py index ed44aab69..1015a7463 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py @@ -4,10 +4,10 @@ from tvm import DataType import tilelang.language as T from tilelang.intrinsics import get_swizzle_layout -from tilelang.intrinsics.mma_macro_generator import ( - TensorCoreIntrinEmitter,) -from tilelang.transform import simplify_prim_func +from tilelang.intrinsics.mma_macro_generator import TensorCoreIntrinEmitter +from tilelang.intrinsics.mfma_macro_generator import MatrixCoreIntrinEmitter from tilelang.utils.tensor import map_torch_type +from tilelang.utils import determine_fp8_type tilelang.testing.set_random_seed(0) @@ -28,7 +28,6 
@@ def transform_func(i, j): @tilelang.jit(out_idx=[2]) -@simplify_prim_func def tl_matmul( M, N, @@ -38,29 +37,25 @@ def tl_matmul( accum_dtype, ): assert in_dtype in [ - "float16", - "float8_e4m3", - "float8_e5m2", - "int8", - ], "Currently only float16 and int8 are supported" + T.float16, + T.float8_e4m3fn, + T.float8_e4m3fnuz, + T.float8_e5m2, + T.float8_e5m2fnuz, + T.int8, + ], "Currently only float16, float8, and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" - micro_size_x = micro_size_y = micro_size_k = 16 - - is_float8 = in_dtype in ["float8_e4m3", "float8_e5m2"] - if out_dtype == "int32" or is_float8: - micro_size_k = 32 - # This is a debug config block_row_warps = 2 block_col_warps = 2 warp_row_tiles = 32 warp_col_tiles = 32 - chunk = 32 if in_dtype == "float16" else 64 + chunk = 32 if in_dtype == T.float16 else 64 shared_scope = "shared.dyn" # Pipeline Stage @@ -74,6 +69,38 @@ def tl_matmul( B_shape = (N, K) A_shared_shape = (block_M, block_K) B_shared_shape = (block_N, block_K) + is_hip = torch.version.hip is not None + # MMA Wrapper to Auto Generate Code for MMA/MFMA + if is_hip: + mma_emitter = MatrixCoreIntrinEmitter( + a_dtype=in_dtype, + b_dtype=in_dtype, + accum_dtype=accum_dtype, + a_transposed=False, + b_transposed=True, + block_row_warps=block_row_warps, + block_col_warps=block_col_warps, + warp_row_tiles=warp_row_tiles, + warp_col_tiles=warp_col_tiles, + chunk=chunk, + ) + else: + mma_emitter = TensorCoreIntrinEmitter( + a_dtype=in_dtype, + b_dtype=in_dtype, + accum_dtype=accum_dtype, + a_transposed=False, + b_transposed=True, + block_row_warps=block_row_warps, + block_col_warps=block_col_warps, + warp_row_tiles=warp_row_tiles, + warp_col_tiles=warp_col_tiles, + chunk=chunk, + ) + + micro_size_x = mma_emitter.M_DIM + micro_size_y = getattr(mma_emitter, "n_dim", getattr(mma_emitter, "N_DIM", micro_size_x)) + micro_size_k = mma_emitter.k_dim C_shared_shape = ( block_M // micro_size_x, block_N // micro_size_y, @@ -81,36 +108,20 @@ def tl_matmul( micro_size_y, ) - warp_size = 32 - threads = warp_size * (block_row_warps * block_col_warps) - local_size_a = (micro_size_x * micro_size_k) // warp_size - local_size_b = (micro_size_y * micro_size_k) // warp_size - local_size_c = (micro_size_x * micro_size_y) // warp_size - warp_rows = warp_row_tiles // micro_size_x - warp_cols = warp_col_tiles // micro_size_y - - # MMA Wrapper to Auto Generate Code for MMA - mma_emitter = TensorCoreIntrinEmitter( - a_dtype=in_dtype, - b_dtype=in_dtype, - accum_dtype=accum_dtype, - a_transposed=False, - b_transposed=True, - block_row_warps=block_row_warps, - block_col_warps=block_col_warps, - warp_row_tiles=warp_row_tiles, - warp_col_tiles=warp_col_tiles, - chunk=chunk, - ) + threads = mma_emitter.threads + local_size_a = mma_emitter.local_size_a + local_size_b = mma_emitter.local_size_b + local_size_c = mma_emitter.local_size_out + warp_rows = mma_emitter.warp_rows + warp_cols = mma_emitter.warp_cols @T.prim_func def gemm_fp8_intrinsic( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, in_dtype, 
scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) @@ -118,10 +129,12 @@ def gemm_fp8_intrinsic( B_local = T.alloc_local((warp_cols * local_size_b), in_dtype) C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype) - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - B_shared: make_swizzle_layout(B_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + B_shared: make_swizzle_layout(B_shared), + } + ) # Improve L2 Cache T.use_swizzle(panel_size=10) @@ -129,7 +142,6 @@ def gemm_fp8_intrinsic( T.clear(C_local) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, block_K): A_shared[i, k] = A[by * block_M + i, ko * block_K + k] @@ -139,7 +151,6 @@ def gemm_fp8_intrinsic( B_shared[j, k] = B[bx * block_N + j, ko * block_K + k] for ki in T.serial(0, (block_K // micro_size_k)): - # Load A into fragment mma_emitter.ldmatrix_a( A_local, @@ -155,7 +166,10 @@ def gemm_fp8_intrinsic( ) # Perform Matrix Multiplication - mma_emitter.mma(A_local, B_local, C_local) + if is_hip: + mma_emitter.mfma(A_local, B_local, C_local, ki) + else: + mma_emitter.mma(A_local, B_local, C_local) # Perform STMatrix mma_emitter.stmatrix( @@ -189,7 +203,12 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): if in_dtype in {torch.int8, torch.int32}: A = torch.randint(-128, 128, (M, K), dtype=torch.int8).to(in_dtype).cuda() B = torch.randint(-128, 128, (N, K), dtype=torch.int8).to(in_dtype).cuda() - elif in_dtype in {torch.float8_e4m3fn, torch.float8_e5m2}: + elif in_dtype in { + torch.float8_e4m3fn, + torch.float8_e4m3fnuz, + torch.float8_e5m2, + torch.float8_e5m2fnuz, + }: A = torch.randn(M, K).to(in_dtype).cuda() B = torch.randn(N, K).to(in_dtype).cuda() else: @@ -215,8 +234,24 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): def main(): - assert_tl_matmul_correctness(128, 128, 128, "float8_e4m3", "float32", "float32") - assert_tl_matmul_correctness(128, 128, 128, "float8_e5m2", "float32", "float32") + e4m3_dtype = determine_fp8_type() + assert_tl_matmul_correctness(128, 128, 128, e4m3_dtype, T.float32, T.float32) + e5m2_dtype = determine_fp8_type("e5m2") + assert_tl_matmul_correctness(128, 128, 128, e5m2_dtype, T.float32, T.float32) + + +def run_regression_perf(): + M, N, K = 4096, 4096, 4096 + out_dtype, accum_dtype = "float32", "float32" + in_dtype = determine_fp8_type() + kernel_e4m3 = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype) + print(kernel_e4m3.get_kernel_source()) + profiler_e4m3 = kernel_e4m3.get_profiler(tilelang.TensorSupplyType.Integer) + if torch.version.hip is None: + latency_e4m3 = profiler_e4m3.do_bench(backend="cupti") + else: + latency_e4m3 = profiler_e4m3.do_bench() + return latency_e4m3 if __name__ == "__main__": diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py new file mode 100644 index 000000000..aa7e8b360 --- /dev/null +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py @@ -0,0 +1,124 @@ +import torch +import tilelang +import tilelang.language as T +from tilelang.utils.tensor import map_torch_type + + +def matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else 
(block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + mbar = T.alloc_barrier(1) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_shared = T.alloc_shared((block_M, block_N), out_dtype) + + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(A[by * block_M, k * block_K], A_shared) + T.copy(B[bx * block_N, k * block_K], B_shared) + T.gemm_v2( + A_shared, + B_shared, + C_tmem, + trans_A, + trans_B, + mbar=mbar, + wg_wait=-1, + clear_accum=(k == 0), + ) + T.mbarrier_wait_parity(mbar, k % 2) + + T.copy(C_tmem, C_local) + T.copy(C_local, C_shared) + + T.copy(C_shared, C[by * block_M, bx * block_N]) + + return main + + +def calc_diff(x, y): + x, y = x.double(), y.double() + denominator = (x * x + y * y).sum() + sim = 2 * (x * y).sum() / denominator + return 1 - sim + + +M, N, K = 4096, 4096, 8192 +block_M, block_N, block_K = 64, 256, 32 +trans_A, trans_B = False, True +num_stages = 2 +threads = 256 +for tvm_fp8_dtype in [T.float8_e4m3fn, T.float8_e5m2]: + for tvm_acc_dtype in [T.float16, T.float32]: # , torch.float16]: + torch_fp8_dtype = map_torch_type(tvm_fp8_dtype) + torch_acc_dtype = map_torch_type(tvm_acc_dtype) + print(f"running {tvm_fp8_dtype} -> {tvm_acc_dtype}") + in_dtype, out_dtype, accum_dtype = tvm_fp8_dtype, tvm_acc_dtype, tvm_acc_dtype + + func = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, + ) + jit_kernel = tilelang.compile( + func, + out_idx=[2], + target="cuda", + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + tilelang.PassConfigKey.TL_ENABLE_PTXAS_VERBOSE_OUTPUT: True, + }, + ) + # jit_kernel.export_ptx("./dump.ptx") + # jit_kernel.export_sources("./dump.cu") + + a = torch.randn(M, K, device="cuda", dtype=torch.float16).to(torch_fp8_dtype) + b = torch.randn(N, K, device="cuda", dtype=torch.float16).to(torch_fp8_dtype) + + c = jit_kernel(a, b) + ref_c = (a.to(torch.half) @ b.T.to(torch.half)).float() + c = c.float() + diff = calc_diff(c, ref_c) + # assert diff < 1e-3, f"{diff}" + print(f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] diff = {diff}") + + profiler = jit_kernel.get_profiler() + latency = profiler.do_bench() + print(f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] Latency: {latency} ms") + print(f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] Flops: {2 * M * N * K / (latency / 1e3) / 1e12} TFLOPS") diff --git a/examples/gemm_fp8/regression_example_gemm_fp8.py b/examples/gemm_fp8/regression_example_gemm_fp8.py new file mode 100644 index 000000000..3ba2f4f27 --- /dev/null +++ b/examples/gemm_fp8/regression_example_gemm_fp8.py @@ -0,0 +1,20 @@ +import tilelang.testing +import example_tilelang_gemm_fp8 +import example_tilelang_gemm_fp8_2xAcc +import example_tilelang_gemm_fp8_intrinsic + + +def regression_example_tilelang_gemm_fp8_2xAcc(): + tilelang.testing.process_func(example_tilelang_gemm_fp8_2xAcc.run_regression_perf) + + +def regression_example_tilelang_gemm_fp8_intrinsic(): + 
tilelang.testing.process_func(example_tilelang_gemm_fp8_intrinsic.run_regression_perf) + + +def regression_example_tilelang_gemm_fp8(): + tilelang.testing.process_func(example_tilelang_gemm_fp8.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/gemm_sm100/README.md b/examples/gemm_sm100/README.md index 73dd76c30..d630d2d0d 100644 --- a/examples/gemm_sm100/README.md +++ b/examples/gemm_sm100/README.md @@ -40,19 +40,19 @@ import tilelang.language as T @T.prim_func def main( - A: T.Tensor((M, K), "bfloat16"), - B: T.Tensor((N, K), "bfloat16"), - C: T.Tensor((M, N), "bfloat16"), + A: T.Tensor((M, K), T.bfloat16), + B: T.Tensor((N, K), T.bfloat16), + C: T.Tensor((M, N), T.bfloat16), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=256) as (bx, by): # 1. Allocate memory buffers - A_shared = T.alloc_shared((block_M, block_K), "bfloat16") # A matrix shared memory - B_shared = T.alloc_shared((block_N, block_K), "bfloat16") # B matrix shared memory - C_tmem = T.alloc_tmem([block_M, block_N], "float") # TCGEN5MMA output to Tensor Memory + A_shared = T.alloc_shared((block_M, block_K), T.bfloat16) # A matrix shared memory + B_shared = T.alloc_shared((block_N, block_K), T.bfloat16) # B matrix shared memory + C_tmem = T.alloc_tmem([block_M, block_N], T.float) # TCGEN5MMA output to Tensor Memory mbar = T.alloc_barrier(1) # mbarrier synchronization primitive - C_local = T.alloc_fragment((block_M, block_N), "float") # Register storage - C_shared = T.alloc_shared((block_M, block_N), "bfloat16") # Output shared memory + C_local = T.alloc_fragment((block_M, block_N), T.float) # Register storage + C_shared = T.alloc_shared((block_M, block_N), T.bfloat16) # Output shared memory # 2. Main computation loop for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=1): @@ -103,4 +103,3 @@ latency = profiler.do_bench() print(f"Latency: {latency} ms") print(f"Performance: {2 * M * N * K / (latency/1e3) / 1e12:.2f} TFLOPS") ``` - diff --git a/examples/gemm_sm100/gemm_mma.py b/examples/gemm_sm100/gemm_mma.py index a58e5a7c0..226e33c01 100644 --- a/examples/gemm_sm100/gemm_mma.py +++ b/examples/gemm_sm100/gemm_mma.py @@ -4,13 +4,12 @@ # add decorator @tilelang.jit if you want to return a torch function # @tilelang.jit -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): # Initialize Kernel Context with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=256) as (bx, by): @@ -62,7 +61,8 @@ def main( pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + }, +) print(jit_kernel.get_kernel_source()) # 3. 
Test the kernel in Python with PyTorch data import torch diff --git a/examples/gemm_sm100/gemm_tcgen5mma.py b/examples/gemm_sm100/gemm_tcgen5mma.py index 9008c7ef5..d3f384e98 100644 --- a/examples/gemm_sm100/gemm_tcgen5mma.py +++ b/examples/gemm_sm100/gemm_tcgen5mma.py @@ -25,9 +25,9 @@ def matmul( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -38,17 +38,9 @@ def main( C_shared = T.alloc_shared((block_M, block_N), out_dtype) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): - T.copy(A[by * block_M, k * block_K], A_shared) - T.copy(B[bx * block_N, k * block_K], B_shared) - T.gemm( - A_shared, - B_shared, - C_tmem, - trans_A, - trans_B, - mbar=mbar, - wg_wait=-1, - clear_accum=k == 0) + T.copy(A[by * block_M, k * block_K], A_shared) # not trans_A + T.copy(B[bx * block_N, k * block_K], B_shared) # trans_B + T.gemm(A_shared, B_shared, C_tmem, trans_A, trans_B, mbar=mbar, wg_wait=-1, clear_accum=k == 0) T.mbarrier_wait_parity(mbar, k % 2) T.copy(C_tmem, C_local) @@ -60,14 +52,13 @@ def main( M, N, K = 4096, 4096, 8192 -block_M, block_N, block_K = 128, 256, 128 +block_M, block_N, block_K = 128, 128, 128 trans_A, trans_B = False, True -in_dtype, out_dtype, accum_dtype = "bfloat16", "bfloat16", "float" -num_stages = 2 +in_dtype, out_dtype, accum_dtype = T.bfloat16, T.bfloat16, T.float +num_stages = 0 if block_N >= 256 or block_M >= 256 or block_K >= 256 else 2 threads = 256 -func = matmul(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, - accum_dtype, num_stages, threads) +func = matmul(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, accum_dtype, num_stages, threads) jit_kernel = tilelang.compile( func, out_idx=[2], @@ -75,7 +66,8 @@ def main( pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + }, +) print(jit_kernel.get_kernel_source()) @@ -88,4 +80,4 @@ def main( profiler = jit_kernel.get_profiler() latency = profiler.do_bench() print(f"Latency: {latency} ms") -print(f"Flops: {2 * M * N * K / (latency/1e3) / 1e12} TFLOPS") +print(f"Flops: {2 * M * N * K / (latency / 1e3) / 1e12} TFLOPS") diff --git a/examples/gemm_sp/example_custom_compress.py b/examples/gemm_sp/example_custom_compress.py new file mode 100644 index 000000000..4b03ae83d --- /dev/null +++ b/examples/gemm_sp/example_custom_compress.py @@ -0,0 +1,342 @@ +import argparse + +import tilelang +import tilelang.language as T + +from tilelang.layout import make_cutlass_metadata_layout +from tilelang.utils.sparse import randn_semi_sparse +from tilelang.utils.tensor import torch_assert_close + +from tilelang.profiler import do_bench + +import torch + +torch.manual_seed(42) + +DEFAULT_CONFIG = { # take best config from autotune script + "4090": { + T.float: { + "block_M": 128, + "block_N": 64, + "block_K": 64, + "num_stages": 1, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + T.float16: { + "block_M": 256, + "block_N": 128, + "block_K": 64, + "num_stages": 2, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + }, + "h20": { + T.float: { + "block_M": 128, + "block_N": 64, + 
"block_K": 128, + "num_stages": 3, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + T.float16: { + "block_M": 128, + "block_N": 64, + "block_K": 128, + "num_stages": 3, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + }, +} + +ARCH_INFO = {"8.0": (16, "int16"), "8.9": (16, "int16"), "9.0": (8, "uint8")} + + +@tilelang.jit(out_idx=[-1]) +def matmul_sp_fp16_custom_compress( + M, N, K, accum_dtype, block_M, block_N, block_K, num_stages, thread_num, policy, enable_rasterization, use_cutlass_layout +): + e_factor, e_dtype = (16, T.int16) + + @T.prim_func + def gemm_sp_fp16_custom_compress( + A_sparse: T.Tensor((M, K // 2), T.float16), + E: T.Tensor((M, K // e_factor), e_dtype), + B: T.Tensor((K, N), T.float16), + C: T.Tensor((M, N), accum_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K // 2), T.float16) + E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype) + B_shared = T.alloc_shared((block_K, block_N), T.float16) + C_shared = T.alloc_shared((block_M, block_N), accum_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + if use_cutlass_layout: + T.annotate_layout( + { + E: make_cutlass_metadata_layout(E, mma_dtype=T.float16, arch="8.0", block_k=block_K), + E_shared: make_cutlass_metadata_layout(E_shared, mma_dtype=T.float16, arch="8.0", block_k=block_K), + } + ) + T.clear(C_local) + T.disable_warp_group_reg_alloc() + T.use_swizzle(panel_size=10, enable=enable_rasterization) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(A_sparse[by * block_M, k * block_K // 2], A_shared) + T.copy(E[by * block_M, k * block_K // e_factor], E_shared) + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm_sp_v2(A_shared, E_shared, B_shared, C_local, False, False, policy=policy) + + T.copy(C_local, C_shared) + T.copy(C_shared, C[by * block_M, bx * block_N]) + + return gemm_sp_fp16_custom_compress + + +def torch_compress(dense): + """ + A naive compression function, where each 4-bit meta matches 4 elements in original matrix in row major layout. 
+ """ + if dense.dim() != 2: + raise RuntimeError(f"Expected 2-dimensional dense tensor, got {dense.dim()}-dimensional tensor") + + m, k = dense.shape + + meta_dtype = torch.int8 + if dense.dtype == torch.int8: + meta_dtype = torch.int32 + elif dense.dtype in [torch.half, torch.bfloat16, torch.float]: + meta_dtype = torch.int16 + else: + raise RuntimeError(f"Invalid datatype {dense.dtype} of dense matrix") + quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4 + if quadbits_per_meta_elem not in (4, 8): + raise RuntimeError("Invalid number of elements per meta element calculated") + + if meta_dtype == torch.int32: + if m % 16 != 0: + raise RuntimeError(f"Number of rows of dense matrix {m} must be divisible by 16") + else: + if m % 32 != 0: + raise RuntimeError(f"Number of rows of dense matrix {m} must be divisible by 32") + if k % (4 * quadbits_per_meta_elem) != 0: + raise RuntimeError(f"Number of columns of dense matrix {k} must be divisible by {4 * quadbits_per_meta_elem}") + + if dense.dtype != torch.float: + ksparse = 4 + dense_4 = dense.view(-1, k // ksparse, ksparse) + m0, m1, _m2, m3 = (dense_4 != 0).unbind(-1) + else: + ksparse = 2 + dense_2 = dense.view(-1, k // ksparse, ksparse) + m0, _m2 = m1, m3 = (dense_2 != 0).unbind(-1) + meta_ncols = k // (ksparse * quadbits_per_meta_elem) + + # Encoding quadruples of True/False values as follows: + # [True, True, False, False] -> 0b0100 + # [True, False, True, False] -> 0b1000 + # [False, True, True, False] -> 0b1001 + # [True, False, False, True ] -> 0b1100 + # [False, True, False, True ] -> 0b1101 + # [False, False, True, True ] -> 0b1110 + # Thus, lower two bits in the encoding are index of the True value + # at the lowest index in the quadruple, and the higher two bits in + # the encoding are index of the other True value in the quadruple. + # In case there are less than two True values, than False value or + # values at some index or indices are considered True for the + # encoding. In case there are more than two True values, then the + # excess True value(s) at some indices are considered False for + # the encoding. The exact encodings used for these cases are as + # follows: + # [False, False, False, False] -> 0b1110 + # [False, False, False, True ] -> 0b1110 + # [False, False, True, False] -> 0b1110 + # [False, True, False, False] -> 0b1001 + # [False, True, True, True ] -> 0b1101 + # [True, False, False, False] -> 0b1000 + # [True, False, True, True ] -> 0b1100 + # [True, True, False, True ] -> 0b0100 + # [True, True, True, False] -> 0b0100 + # [True, True, True, True ] -> 0b0100 + # These particular encodings are chosen, with the help of Espresso + # logic minimizer software, for the purpose of minimization of + # corresponding Boolean functions, that translate non-zero flags + # into encoding bits. Note also possible choices for the first + # and last of these encodings were limited only to (0b0100, + # 0b1110), in order to produce valid encodings for 1:2 sparsity + # case. 
+ + expr0 = m0 & m1 + expr1 = ~m0 & m1 + expr2 = ~m0 & ~m1 + bit0 = expr1 + bit1 = expr2 + bit2 = expr0 | expr2 | m3 + bit3 = expr1 | ~m1 + idxs0 = bit0 | (bit1.to(torch.int64) << 1) + idxs1 = bit2 | (bit3.to(torch.int64) << 1) + + if dense.dtype != torch.float: + sparse0 = dense_4.gather(-1, idxs0.unsqueeze(-1)) # type: ignore[possibly-undefined] + sparse1 = dense_4.gather(-1, idxs1.unsqueeze(-1)) + sparse = torch.stack((sparse0, sparse1), dim=-1).view(m, k // 2) + else: + sparse = dense_2.gather(-1, idxs0.unsqueeze(-1) // 2).view(m, k // 2) # type: ignore[possibly-undefined] + + meta_4 = idxs0 | (idxs1 << 2) + meta_n = meta_4.view((-1, meta_ncols, quadbits_per_meta_elem)).to(meta_dtype) + + if quadbits_per_meta_elem == 4: + meta = meta_n[:, :, 0] | (meta_n[:, :, 1] << 4) | (meta_n[:, :, 2] << 8) | (meta_n[:, :, 3] << 12) + elif quadbits_per_meta_elem == 8: + meta = ( + meta_n[:, :, 0] + | (meta_n[:, :, 1] << 4) + | (meta_n[:, :, 2] << 8) + | (meta_n[:, :, 3] << 12) + | (meta_n[:, :, 4] << 16) + | (meta_n[:, :, 5] << 20) + | (meta_n[:, :, 6] << 24) + | (meta_n[:, :, 7] << 28) + ) + + return (sparse, meta) + + +def decode_metadata(meta: torch.Tensor) -> torch.Tensor: + assert meta.dtype is torch.int16 + groups_per_meta = 16 // 4 # 4 groups per uint16 + out = [] + for g in range(groups_per_meta): + group_bits = (meta >> (g * 4)) & 0xF + idx0 = group_bits & 0x3 + idx1 = (group_bits >> 2) & 0x3 + out.append(torch.stack([idx0, idx1], dim=-1)) + return torch.concat(out, dim=-1).view(meta.shape[0], -1) + + +@tilelang.jit( + out_idx=[1, 2], + pass_configs={ + tilelang.PassConfigKey.TIR_DISABLE_VECTORIZE: True, + }, +) +def compress_kernel(M, K, block_M, block_K, dtype, use_cutlass_layout): + e_factor, e_dtype = ARCH_INFO["8.0"] + e_K = K // e_factor + elem, group = 2, 4 + + assert M % block_M == 0, "M must be divisible by block_M" + assert K % block_K == 0, "K must be divisible by block_K" + assert K % e_factor == 0, "K must be divisible by e_factor" + assert block_K % e_factor == 0, "block_K must be divisible by e_factor" + + @T.prim_func + def kernel( + A: T.Tensor((M, K), dtype), + A_sp: T.Tensor((M, K // 2), dtype), + E: T.Tensor((M, e_K), e_dtype), + ): + with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(K, block_K), threads=block_M) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + A_sp_shared = T.alloc_shared((block_M, block_K // 2), dtype) + E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype) + if use_cutlass_layout: + T.annotate_layout( + { + E: make_cutlass_metadata_layout(E, mma_dtype=T.float16, arch="8.0", block_k=block_K), + E_shared: make_cutlass_metadata_layout(E_shared, mma_dtype=T.float16, arch="8.0", block_k=block_K), + } + ) + T.clear(A_sp_shared) + T.clear(E_shared) + # TODO: alloc_var seems buggy here + non_zero_cnt = T.alloc_local((1,), dtype=T.uint8) + non_zero_elt_log_idx = T.alloc_local((elem,), dtype=T.uint8) + T.copy(A[bx * block_M, by * block_K], A_shared) + for tm in T.Parallel(block_M): + for g_i in range(0, block_K // group): + a_k = g_i * group + non_zero_cnt[0] = 0 + for i in range(elem): + non_zero_elt_log_idx[i] = 0 + for i in range(group): + val = A_shared[tm, a_k + i] + if val != 0.0: + non_zero_elt_log_idx[non_zero_cnt[0]] = i + A_sp_shared[tm, a_k // 2 + non_zero_cnt[0]] = val + non_zero_cnt[0] += 1 + # TODO: use T.device_assert(non_zero_cnt <= 2) after rebasing main + if non_zero_cnt[0] == 1 and non_zero_elt_log_idx[0] == 3: + non_zero_elt_log_idx[0] = 0 + non_zero_elt_log_idx[1] = 3 + A_sp_shared[tm, a_k // 2 + 1] = 
A_sp_shared[tm, a_k // 2] + A_sp_shared[tm, a_k // 2] = 0.0 + elif non_zero_cnt[0] == 1: + A_sp_shared[tm, a_k // 2 + 1] = 0 + non_zero_elt_log_idx[1] = 3 + for i in T.serial(elem): + val = non_zero_elt_log_idx[i] + E_shared[tm, a_k // e_factor] |= T.shift_left(val, 4 * (g_i % (e_factor // group)) + 2 * i) + T.copy(A_sp_shared, A_sp[bx * block_M, by * block_K // 2]) + T.copy(E_shared, E[bx * block_M, by * block_K // e_factor]) + + return kernel + + +def main(M=1024, N=1024, K=1024, use_cutlass_layout=False, use_torch_compressor=False, accum_dtype=T.float, cfg="4090"): + kernel = matmul_sp_fp16_custom_compress(M, N, K, accum_dtype, **DEFAULT_CONFIG[cfg][accum_dtype], use_cutlass_layout=use_cutlass_layout) + + a = randn_semi_sparse(M, K, device="cuda", dtype=torch.half) + b = torch.randn(K, N, device="cuda", dtype=torch.half) + + if use_torch_compressor: + assert not use_cutlass_layout, "torch sparse must be used with naive layout" + a_sparse, e = torch_compress(a) + else: + a_sparse, e = compress_kernel(M, K, 32, 32, T.float16, use_cutlass_layout=use_cutlass_layout)(a) + + c = kernel(a_sparse, e, b) + + ref_c = a @ b + + assert not c.isnan().any(), "Reference result contains NaNs, please report an issue" + torch_assert_close(c, ref_c.to(c.dtype), rtol=1e-3, atol=1e-3) + print(f"Precision check passed. Max diff: {(c - ref_c).abs().max()}, Mean diff: {(c - ref_c).abs().mean()}") + + latency = do_bench(lambda: kernel(a_sparse, e, b)) + ref_latency = do_bench(lambda: a @ b) + + total_flops = 2 * M * N * K + tflops = total_flops / latency / 1e9 + ref_tflops = total_flops / ref_latency / 1e9 + print(f"Sparse TFLOPS: {tflops:.2f}, Latency: {latency / 1e3} s") + print(f"Reference TFLOPS: {ref_tflops:.2f}, Latency: {ref_latency / 1e3:} s") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark") + parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M") + parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") + parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") + parser.add_argument("--use_cutlass_layout", action="store_true", help="Use cutlass layout for E tensor") + parser.add_argument("--use_torch_compressor", action="store_true", help="Use torch sparse for reference") + parser.add_argument("--accum_dtype", type=str, default=T.float, choices=[T.float, T.float16], help="Accumulation datatype") + parser.add_argument("--cfg", type=str, choices=["4090"], default="4090") + args = parser.parse_args() + main( + M=args.m, + N=args.n, + K=args.k, + use_cutlass_layout=args.use_cutlass_layout, + use_torch_compressor=args.use_torch_compressor, + accum_dtype=args.accum_dtype, + cfg=args.cfg, + ) diff --git a/examples/gemm_sp/example_gemm_sp.py b/examples/gemm_sp/example_gemm_sp.py index 505f2b883..769ea6736 100644 --- a/examples/gemm_sp/example_gemm_sp.py +++ b/examples/gemm_sp/example_gemm_sp.py @@ -1,99 +1,90 @@ -# Copyright (c) Tile-AI Corporation. -# Licensed under the MIT License. 
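+"""2:4 semi-structured sparse GEMM example (fp16) using the CUTLASS metadata layout, checked and benchmarked against a dense torch matmul."""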
import argparse import tilelang import tilelang.language as T -from tilelang.layout import make_metadata_layout +from tilelang.layout import make_cutlass_metadata_layout from tilelang.utils.sparse import compress, randn_semi_sparse from tilelang.contrib import nvcc -from triton.testing import do_bench +from tilelang.profiler import do_bench import torch arch = nvcc.get_target_compute_version() -ARCH_INFO = {"8.0": (16, "int16"), "8.9": (16, "int16"), "9.0": (8, "uint8")} - -default_config = { # take best config from autotune script +DEFAULT_CONFIG = { # take best config from autotune script "4090": { - 'float': { - 'block_M': 128, - 'block_N': 64, - 'block_K': 64, - 'num_stages': 1, - 'thread_num': 128, - 'policy': T.GemmWarpPolicy.Square, - 'enable_rasterization': True + T.float: { + "block_M": 128, + "block_N": 64, + "block_K": 64, + "num_stages": 1, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + T.float16: { + "block_M": 256, + "block_N": 128, + "block_K": 64, + "num_stages": 2, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, }, - 'float16': { - 'block_M': 256, - 'block_N': 128, - 'block_K': 64, - 'num_stages': 2, - 'thread_num': 128, - 'policy': T.GemmWarpPolicy.Square, - 'enable_rasterization': True - } }, "h20": { - 'float': { - 'block_M': 128, - 'block_N': 64, - 'block_K': 128, - 'num_stages': 3, - 'thread_num': 128, - 'policy': T.GemmWarpPolicy.Square, - 'enable_rasterization': True + T.float: { + "block_M": 128, + "block_N": 64, + "block_K": 128, + "num_stages": 3, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, }, - 'float16': { - 'block_M': 128, - 'block_N': 64, - 'block_K': 128, - 'num_stages': 3, - 'thread_num': 128, - 'policy': T.GemmWarpPolicy.Square, - 'enable_rasterization': True - } - } + T.float16: { + "block_M": 128, + "block_N": 64, + "block_K": 128, + "num_stages": 3, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + }, } +ARCH_INFO = {"8.0": (16, "int16"), "8.9": (16, "int16"), "9.0": (8, "uint8")} + @tilelang.jit(out_idx=[-1]) -def matmul_sp_fp16(M, N, K, accum_dtype, block_M, block_N, block_K, num_stages, thread_num, policy, - enable_rasterization): +def matmul_sp_fp16(M, N, K, accum_dtype, block_M, block_N, block_K, num_stages, thread_num, policy, enable_rasterization): e_factor, e_dtype = ARCH_INFO[arch] @T.prim_func def gemm_sp_fp16( - A_sparse: T.Tensor((M, K // 2), 'float16'), - E: T.Tensor((M, K // e_factor), e_dtype), - B: T.Tensor((K, N), 'float16'), - C: T.Tensor((M, N), accum_dtype), + A_sparse: T.Tensor((M, K // 2), T.float16), + E: T.Tensor((M, K // e_factor), e_dtype), + B: T.Tensor((K, N), T.float16), + C: T.Tensor((M, N), accum_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): - A_shared = T.alloc_shared((block_M, block_K // 2), 'float16') + A_shared = T.alloc_shared((block_M, block_K // 2), T.float16) E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype) - B_shared = T.alloc_shared((block_K, block_N), 'float16') + B_shared = T.alloc_shared((block_K, block_N), T.float16) C_shared = T.alloc_shared((block_M, block_N), accum_dtype) C_local = T.alloc_fragment((block_M, block_N), accum_dtype) T.clear(C_local) T.disable_warp_group_reg_alloc() T.use_swizzle(panel_size=10, enable=enable_rasterization) - T.annotate_layout({ - E: - make_metadata_layout( - E, mma_dtype="float16", backend="cutlass", block_k=block_K, 
arch=arch), - E_shared: - make_metadata_layout( - E_shared, - mma_dtype="float16", - backend="cutlass", - block_k=block_K, - arch=arch), - }) + T.annotate_layout( + { + E: make_cutlass_metadata_layout(E, mma_dtype=T.float16, block_k=block_K, arch=arch), + E_shared: make_cutlass_metadata_layout(E_shared, mma_dtype=T.float16, block_k=block_K, arch=arch), + } + ) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(A_sparse[by * block_M, k * block_K // 2], A_shared) T.copy(E[by * block_M, k * block_K // e_factor], E_shared) @@ -106,30 +97,13 @@ def gemm_sp_fp16( return gemm_sp_fp16 -def main(): - parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark") - parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M") - parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") - parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") - parser.add_argument( - "--accum_dtype", - type=str, - default="float", - choices=["float", "float16"], - help="Accumulation datatype") - parser.add_argument("--cfg", type=str, choices=["4090", "h20"], required=True) - args = parser.parse_args() - kernel = matmul_sp_fp16(args.m, args.n, args.k, args.accum_dtype, - **default_config[args.cfg][args.accum_dtype]) +def main(M=1024, N=1024, K=1024, accum_dtype=T.float, cfg="h20"): + kernel = matmul_sp_fp16(M, N, K, accum_dtype, **DEFAULT_CONFIG[cfg][accum_dtype]) - a = randn_semi_sparse(args.m, args.k, device='cuda', dtype=torch.half) - b = torch.randn(args.k, args.n, device='cuda', dtype=torch.half) + a = randn_semi_sparse(M, K, device="cuda", dtype=torch.half) + b = torch.randn(K, N, device="cuda", dtype=torch.half) - a_sparse, e = compress( - a, - transposed=False, - block_k=default_config[args.cfg][args.accum_dtype]['block_K'], - arch=arch) + a_sparse, e = compress(a, transposed=False, block_k=DEFAULT_CONFIG[cfg][accum_dtype]["block_K"], arch=arch) c = kernel(a_sparse, e, b) ref_c = a @ b @@ -141,12 +115,19 @@ def main(): latency = do_bench(lambda: kernel(a_sparse, e, b)) ref_latency = do_bench(lambda: a @ b) - total_flops = 2 * args.m * args.n * args.k + total_flops = 2 * M * N * K tflops = total_flops / latency / 1e9 ref_tflops = total_flops / ref_latency / 1e9 - print(f"Sparse TFLOPS: {tflops:.2f}, Latency: {latency/1e3} s") - print(f"Reference TFLOPS: {ref_tflops:.2f}, Latency: {ref_latency/1e3:} s") + print(f"Sparse TFLOPS: {tflops:.2f}, Latency: {latency / 1e3} s") + print(f"Reference TFLOPS: {ref_tflops:.2f}, Latency: {ref_latency / 1e3:} s") if __name__ == "__main__": - main() + parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark") + parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M") + parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") + parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") + parser.add_argument("--accum_dtype", type=str, default=T.float, choices=[T.float, T.float16], help="Accumulation datatype") + parser.add_argument("--cfg", type=str, choices=["4090", "h20"], default="4090") + args = parser.parse_args() + main(M=args.m, N=args.n, K=args.k, accum_dtype=args.accum_dtype, cfg=args.cfg) diff --git a/examples/gemm_sp/test_example_gemm_sp.py b/examples/gemm_sp/test_example_gemm_sp.py new file mode 100644 index 000000000..fe26df144 --- /dev/null +++ b/examples/gemm_sp/test_example_gemm_sp.py @@ -0,0 +1,16 @@ +import tilelang.testing + +import example_custom_compress +import example_gemm_sp + + 
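+# Smoke tests: each one runs the corresponding example's main() with its default problem sizes.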
+def test_example_custom_compress(): + example_custom_compress.main() + + +def test_example_gemm_sp(): + example_gemm_sp.main() + + +if __name__ == "__main__": + tilelang.testing.main() diff --git a/examples/gemm_splitk/example_tilelang_gemm_splitk.py b/examples/gemm_splitk/example_tilelang_gemm_splitk.py index c96669711..64ffade8e 100644 --- a/examples/gemm_splitk/example_tilelang_gemm_splitk.py +++ b/examples/gemm_splitk/example_tilelang_gemm_splitk.py @@ -3,27 +3,16 @@ @tilelang.jit -def matmul(M, - N, - K, - block_M, - block_N, - block_K, - split_k, - dtype="float16", - accum_dtype="float", - out_dtype="float32"): - +def matmul(M, N, K, block_M, block_N, block_K, split_k, dtype=T.float16, accum_dtype=T.float32, out_dtype=T.float32): splitK = K // split_k @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), out_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), split_k, threads=128) as (bx, by, bz): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), split_k, threads=128) as (bx, by, bz): A_shared = T.alloc_shared((block_M, block_K), dtype) B_shared = T.alloc_shared((block_K, block_N), dtype) C_shared = T.alloc_shared((block_M, block_N), out_dtype) @@ -67,5 +56,28 @@ def main(): torch.testing.assert_close(c, ref_c.to(c.dtype), rtol=1e-2, atol=1e-2) +def run_regression_perf(): + M = 4096 + N = 4096 + K = 4096 + block_M = 128 + block_N = 128 + block_K = 32 + split_k = 4 + kernel = matmul(M, N, K, block_M, block_N, block_K, split_k) + import torch + + torch.random.manual_seed(42) + a = torch.randn(M, K).cuda().half() + b = torch.randn(K, N).cuda().half() + c = torch.zeros(M, N).cuda().float() + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(a, b, c) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py b/examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py index 145d622ed..3d33478cf 100644 --- a/examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py +++ b/examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py @@ -3,27 +3,16 @@ @tilelang.jit -def matmul(M, - N, - K, - block_M, - block_N, - block_K, - split_k, - dtype="float16", - accum_dtype="float", - out_dtype="float32"): - +def matmul(M, N, K, block_M, block_N, block_K, split_k, dtype=T.float16, accum_dtype=T.float32, out_dtype=T.float32): splitK = K // split_k @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), out_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), split_k, threads=128) as (bx, by, bz): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), split_k, threads=128) as (bx, by, bz): A_shared = T.alloc_shared((block_M, block_K), dtype) B_shared = T.alloc_shared((block_K, block_N), dtype) C_shared = T.alloc_shared((block_M, block_N), out_dtype) @@ -66,5 +55,29 @@ def main(): torch.testing.assert_close(c, ref_c.to(c.dtype), rtol=1e-2, atol=1e-2) +def run_regression_perf(): + M = 4096 + N = 4096 + K = 4096 + block_M = 128 + block_N = 128 + block_K = 32 + split_k = 4 + + kernel = matmul(M, N, K, block_M, block_N, block_K, split_k) + import torch + + 
torch.random.manual_seed(42) + a = torch.randn(M, K).cuda().half() + b = torch.randn(K, N).cuda().half() + c = torch.zeros(M, N).cuda().float() + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(a, b, c) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/gemm_splitk/regression_example_gemm_splitk.py b/examples/gemm_splitk/regression_example_gemm_splitk.py new file mode 100644 index 000000000..c76b7e55c --- /dev/null +++ b/examples/gemm_splitk/regression_example_gemm_splitk.py @@ -0,0 +1,15 @@ +import tilelang.testing +import example_tilelang_gemm_splitk +import example_tilelang_gemm_splitk_vectorize_atomicadd + + +def regression_example_tilelang_gemm_splitk(): + tilelang.testing.process_func(example_tilelang_gemm_splitk.run_regression_perf) + + +def regression_example_tilelang_gemm_splitk_vectorize_atomicadd(): + tilelang.testing.process_func(example_tilelang_gemm_splitk_vectorize_atomicadd.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/gemm_streamk/example_tilelang_gemm_streamk.py b/examples/gemm_streamk/example_tilelang_gemm_streamk.py index 31cf40647..b2e8e9369 100644 --- a/examples/gemm_streamk/example_tilelang_gemm_streamk.py +++ b/examples/gemm_streamk/example_tilelang_gemm_streamk.py @@ -39,7 +39,7 @@ def cdiv(a, b): # Two-tile SK + DP streamk_tiles = total_tiles % streamk_programs -if (total_tiles - streamk_tiles > streamk_programs): # (total_tiles // total_programs > 1) +if total_tiles - streamk_tiles > streamk_programs: # (total_tiles // total_programs > 1) streamk_tiles += streamk_programs blocking_tiles = total_tiles - streamk_tiles @@ -77,95 +77,71 @@ def tl_matmul_streamk( A_shared_shape = (block_M, block_K) if not trans_A else (block_K, block_M) B_shared_shape = (block_K, block_N) if not trans_B else (block_N, block_K) - @T.macro - def compute_first_wave( - pid: T.int32, - A_buf: T.Tensor, - A_buf_shared: T.SharedBuffer, - B_buf: T.Tensor, - B_buf_shared: T.SharedBuffer, - C: T.Tensor, - C_local: T.LocalBuffer, - ): - start_iter = T.alloc_fragment((1,), "int32", "local") - end_iter = T.alloc_fragment((1,), "int32", "local") - - start_iter[0] = pid * streamk_full_tiles + T.min(pid, streamk_partial_tiles) - last_iter = (pid + 1) * streamk_full_tiles + T.min(pid + 1, streamk_partial_tiles) - - while start_iter[0] < last_iter: - end_iter[0] = T.min( - start_iter[0] + (iters_per_tile - (start_iter[0] % iters_per_tile)), - last_iter, - ) - - tile_id = start_iter[0] // iters_per_tile - remain_iters = start_iter[0] % iters_per_tile - pid_m = tile_id // T.ceildiv(N, block_N) - pid_n = tile_id % T.ceildiv(N, block_N) - - T.clear(C_local) - for k in T.Pipelined(end_iter[0] - start_iter[0], num_stages=num_stages): - T.copy( - A_buf[pid_m * block_M, (k + (start_iter[0] % iters_per_tile)) * block_K], - A_buf_shared, - ) - T.copy( - B_buf[pid_n * block_N, (k + (start_iter[0] % iters_per_tile)) * block_K], - B_buf_shared, - ) - T.gemm(A_buf_shared, B_buf_shared, C_local, transpose_B=trans_B) - - # last iteration of the tile always happens before its start on another SM - if remain_iters == 0 and (end_iter[0] % iters_per_tile == 0): - T.copy(C_local, C[pid_m * block_M, pid_n * block_N]) - else: - for i, j in T.Parallel(block_M, block_N): - T.atomic_add(C[pid_m * block_M + i, pid_n * block_N + j], C_local[i, j]) - - start_iter[0] = end_iter[0] - - @T.macro - def compute_full_tiles( - pid: T.int32, - A_buf: T.Tensor, - A_shared: T.SharedBuffer, - 
B_buf: T.Tensor, - B_shared: T.SharedBuffer, - C: T.Tensor, - C_local: T.LocalBuffer, - ): - - for p in T.serial(sm_patition_factor): - tile_id = pid + streamk_tiles + p * total_sm - pid_m = tile_id // T.ceildiv(N, block_N) - pid_n = tile_id % T.ceildiv(N, block_N) - T.clear(C_local) - - for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=1): - T.copy(A_buf[pid_m * block_M, k * block_K], A_shared) - T.copy(B_buf[pid_n * block_N, k * block_K], B_shared) - T.gemm(A_shared, B_shared, C_local, transpose_B=trans_B) - T.copy(C_local, C[pid_m * block_M, pid_n * block_N]) - @T.prim_func def main( - A: T.Tensor(A_shape, dtypeAB), - B: T.Tensor(B_shape, dtypeAB), - C: T.Tensor((M, N), dtypeC), + A: T.Tensor(A_shape, dtypeAB), + B: T.Tensor(B_shape, dtypeAB), + C: T.Tensor((M, N), dtypeC), ): with T.Kernel(streamk_programs, threads=threads) as pid: - A_shared = T.alloc_shared(A_shared_shape, dtypeAB) B_shared = T.alloc_shared(B_shared_shape, dtypeAB) A_shared_full_tiles = T.alloc_shared(A_shared_shape, dtypeAB) B_shared_full_tiles = T.alloc_shared(B_shared_shape, dtypeAB) C_local = T.alloc_fragment((block_M, block_N), accum_dtype) - compute_first_wave(pid, A, A_shared, B, B_shared, C, C_local) + # compute first wave + start_iter = T.alloc_fragment((1,), T.int32, "local") + end_iter = T.alloc_fragment((1,), T.int32, "local") + + start_iter[0] = pid * streamk_full_tiles + T.min(pid, streamk_partial_tiles) + last_iter = (pid + 1) * streamk_full_tiles + T.min(pid + 1, streamk_partial_tiles) + while start_iter[0] < last_iter: + end_iter[0] = T.min( + start_iter[0] + (iters_per_tile - (start_iter[0] % iters_per_tile)), + last_iter, + ) + + tile_id = start_iter[0] // iters_per_tile + remain_iters = start_iter[0] % iters_per_tile + pid_m = tile_id // T.ceildiv(N, block_N) + pid_n = tile_id % T.ceildiv(N, block_N) + + T.clear(C_local) + for k in T.Pipelined(end_iter[0] - start_iter[0], num_stages=num_stages): + T.copy( + A[pid_m * block_M, (k + (start_iter[0] % iters_per_tile)) * block_K], + A_shared, + ) + T.copy( + B[pid_n * block_N, (k + (start_iter[0] % iters_per_tile)) * block_K], + B_shared, + ) + T.gemm(A_shared, B_shared, C_local, transpose_B=trans_B) + + # last iteration of the tile always happens before its start on another SM + if remain_iters == 0 and (end_iter[0] % iters_per_tile == 0): + T.copy(C_local, C[pid_m * block_M, pid_n * block_N]) + else: + for i, j in T.Parallel(block_M, block_N): + T.atomic_add(C[pid_m * block_M + i, pid_n * block_N + j], C_local[i, j]) + + start_iter[0] = end_iter[0] + + # compute full tiles if sm_patition_factor > 0: - compute_full_tiles(pid, A, A_shared_full_tiles, B, B_shared_full_tiles, C, C_local) + for p in T.serial(sm_patition_factor): + tile_id = pid + streamk_tiles + p * total_sm + pid_m = tile_id // T.ceildiv(N, block_N) + pid_n = tile_id % T.ceildiv(N, block_N) + T.clear(C_local) + + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=1): + T.copy(A[pid_m * block_M, k * block_K], A_shared_full_tiles) + T.copy(B[pid_n * block_N, k * block_K], B_shared_full_tiles) + T.gemm(A_shared_full_tiles, B_shared_full_tiles, C_local, transpose_B=trans_B) + T.copy(C_local, C[pid_m * block_M, pid_n * block_N]) return main @@ -181,9 +157,9 @@ def main(): BLOCK_SIZE_K, False, True, - "float16", - "float16", - "float32", + T.float16, + T.float16, + T.float32, 2, 64, ) @@ -201,5 +177,30 @@ def main(): torch.testing.assert_close(C, b_c, rtol=1e-2, atol=1e-2) +def run_regression_perf(): + kernel = tl_matmul_streamk( + m, + n, + k, + streamk_tiles, + BLOCK_SIZE_M, + 
BLOCK_SIZE_N, + BLOCK_SIZE_K, + False, + True, + "float16", + "float16", + "float32", + 2, + 64, + ) + b_c = torch.zeros((m, n), device="cuda", dtype=torch.float16) + torch.cuda.synchronize() + + from tilelang.profiler import do_bench + + return do_bench(lambda: kernel(A, B, b_c), backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/gemm_streamk/test_example_tilelang_gemm_splitk.py b/examples/gemm_streamk/test_example_tilelang_gemm_streamk.py similarity index 100% rename from examples/gemm_streamk/test_example_tilelang_gemm_splitk.py rename to examples/gemm_streamk/test_example_tilelang_gemm_streamk.py diff --git a/examples/gemv/example_gemv.py b/examples/gemv/example_gemv.py index 4e43dcd9a..a5ecffbd0 100644 --- a/examples/gemv/example_gemv.py +++ b/examples/gemv/example_gemv.py @@ -17,15 +17,14 @@ def naive_gemv( K: int, BLOCK_N: int, BLOCK_K: int, - dtype: str = "float16", - accum_dtype: str = "float", + dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float, ): - @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N)) as bn: tn = T.get_thread_binding(0) # tn = threadIdx.x @@ -38,8 +37,7 @@ def main( A_shared[tk] = A[bk * BLOCK_K + tk] B_shared[tn, tk] = B[bn * BLOCK_N + tn, bk * BLOCK_K + tk] for tk in T.serial(BLOCK_K): - C_reg[0] += A_shared[tk].astype(accum_dtype) * B_shared[tn, - tk].astype(accum_dtype) + C_reg[0] += A_shared[tk].astype(accum_dtype) * B_shared[tn, tk].astype(accum_dtype) C[bn * BLOCK_N + tn] = C_reg[0] return main @@ -51,15 +49,14 @@ def naive_splitk_gemv( K: int, BLOCK_N: int, BLOCK_K: int, - dtype: str = "float16", - accum_dtype: str = "float", + dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float, ): - @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N), threads=(BLOCK_N, BLOCK_K)) as bn: tn = T.get_thread_binding(0) @@ -88,16 +85,16 @@ def splitk_gemv( BLOCK_N: int, BLOCK_K: int, reduce_threads: int, - dtype: str = "float16", - accum_dtype: str = "float", + dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float, ): TILE_K = T.ceildiv(BLOCK_K, reduce_threads) @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N), threads=(BLOCK_N, reduce_threads)) as bn: tn = T.get_thread_binding(0) @@ -127,8 +124,8 @@ def splitk_gemv_vectorized( K: int, BLOCK_N: int, reduce_threads: int, - dtype: str = "float16", - accum_dtype: str = "float", + dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float, ): MAX_TRANSACTION_SIZE_IN_BITS = 128 TILE_K = MAX_TRANSACTION_SIZE_IN_BITS // DataType(dtype).bits @@ -136,9 +133,9 @@ def splitk_gemv_vectorized( @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N), threads=(BLOCK_N, reduce_threads)) as bn: tn = T.get_thread_binding(0) @@ -168,8 +165,8 @@ def splitk_gemv_vectorized_tvm( K: int, BLOCK_N: int, reduce_threads: int, - dtype: str = "float16", - accum_dtype: str = "float", + 
dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float, ): MAX_TRANSACTION_SIZE_IN_BITS = 128 TILE_K = MAX_TRANSACTION_SIZE_IN_BITS // DataType(dtype).bits @@ -177,9 +174,9 @@ def splitk_gemv_vectorized_tvm( @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N), threads=(BLOCK_N, reduce_threads)) as bn: tn = T.get_thread_binding(0) @@ -197,9 +194,9 @@ def main( C_accum[0] += A_local[k].astype(accum_dtype) * B_local[k].astype(accum_dtype) C_reduced = T.alloc_local((1,), accum_dtype) with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), - "reduce_scope", - T.reinterpret(T.uint64(0), dtype="handle"), + T.comm_reducer(lambda x, y: x + y, [T.cast(0, accum_dtype)]), + "reduce_scope", + T.reinterpret(T.uint64(0), dtype="handle"), ): T.evaluate( T.tvm_thread_allreduce( @@ -209,7 +206,8 @@ def main( C_reduced[0], tk, dtype="handle", - )) + ) + ) C[bn * BLOCK_N + tn] = C_reduced[0] @@ -218,10 +216,8 @@ def main( def get_block_template_configs(): iter_params = dict( - block_M=[2, 4, 8, 32, 64, 128], - block_N=[2, 4, 8, 32, 64, 128], - num_stages=[0, 1, 2, 3, 4], - threads=[32, 64, 128, 256]) + block_M=[2, 4, 8, 32, 64, 128], block_N=[2, 4, 8, 32, 64, 128], num_stages=[0, 1, 2, 3, 4], threads=[32, 64, 128, 256] + ) return [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())] @@ -237,18 +233,11 @@ def get_block_template_configs(): }, out_idx=[2], ) -def gemv_alloc_reducer(M, - N, - block_M=128, - block_N=128, - num_stages=2, - threads=256, - dtype: str = "float16", - accum_dtype: str = "float"): - +def gemv_alloc_reducer( + M, N, block_M=128, block_N=128, num_stages=2, threads=256, dtype: T.dtype = T.float16, accum_dtype: T.dtype = T.float +): @T.prim_func - def main(a: T.Tensor((M, N), dtype), x: T.Tensor(N, dtype), o: T.Tensor(M, - dtype)): # type: ignore + def main(a: T.Tensor((M, N), dtype), x: T.Tensor(N, dtype), o: T.Tensor(M, dtype)): # type: ignore with T.Kernel(T.ceildiv(M, block_M), threads=threads) as i0_m: o_reducer = T.alloc_reducer(block_M, accum_dtype, replication="all") T.clear(o_reducer) @@ -287,17 +276,17 @@ def get_autotuned_kernel( BLOCK_N=None, reduce_threads=None, ): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 MAX_TRANSACTION_SIZE_IN_BITS = 128 TILE_K = MAX_TRANSACTION_SIZE_IN_BITS // DataType(dtype).bits BLOCK_K = reduce_threads * TILE_K @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N), threads=(BLOCK_N, reduce_threads)) as bn: tn = T.get_thread_binding(0) @@ -315,9 +304,9 @@ def main( C_accum[0] += A_local[k].astype(accum_dtype) * B_local[k].astype(accum_dtype) C_reduced = T.alloc_local((1,), accum_dtype) with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), - "reduce_scope", - T.reinterpret(T.uint64(0), dtype="handle"), + T.comm_reducer(lambda x, y: x + y, [T.cast(0, accum_dtype)]), + "reduce_scope", + T.reinterpret(T.uint64(0), dtype="handle"), ): T.evaluate( T.tvm_thread_allreduce( @@ -327,21 +316,22 @@ def main( C_reduced[0], tk, dtype="handle", - )) + ) + ) C[bn * BLOCK_N + tn] = C_reduced[0] return main -def check_correctness_and_bench(kernel, N, K, bench_ref=True): +def 
check_correctness_and_bench(kernel, N, K, do_bench=True): profiler = kernel.get_profiler() profiler.assert_allclose(lambda x, y: x @ y.T, atol=1e-2, rtol=1e-2) - if bench_ref: + if do_bench: latency = profiler.do_bench(lambda x, y: x @ y.T, warmup=50) print(f"Torch Latency: {latency} ms") - latency = profiler.do_bench(kernel, warmup=50) - print(f"TileLang Latency: {latency} ms\n") + latency = profiler.do_bench(kernel, warmup=50) + print(f"TileLang Latency: {latency} ms\n") def main(do_bench: bool = True): @@ -350,16 +340,16 @@ def main(do_bench: bool = True): parser.add_argument("--k", type=int, default=1024, help="Matrix dimension K") args, _ = parser.parse_known_args() N, K = args.n, args.k - check_correctness_and_bench(naive_gemv(N, K, 128, 128), N, K) - check_correctness_and_bench(naive_splitk_gemv(N, K, 32, 32), N, K) - check_correctness_and_bench(splitk_gemv(N, K, 32, 32, 32), N, K) - check_correctness_and_bench(splitk_gemv_vectorized(N, K, 2, 32), N, K) - check_correctness_and_bench(splitk_gemv_vectorized_tvm(N, K, 2, 32), N, K) - check_correctness_and_bench(gemv_alloc_reducer(N, K, block_M=128, block_N=128), N, K) + check_correctness_and_bench(naive_gemv(N, K, 128, 128), N, K, do_bench=do_bench) + check_correctness_and_bench(naive_splitk_gemv(N, K, 32, 32), N, K, do_bench=do_bench) + check_correctness_and_bench(splitk_gemv(N, K, 32, 32, 32), N, K, do_bench=do_bench) + check_correctness_and_bench(splitk_gemv_vectorized(N, K, 2, 32), N, K, do_bench=do_bench) + check_correctness_and_bench(splitk_gemv_vectorized_tvm(N, K, 2, 32), N, K, do_bench=do_bench) + check_correctness_and_bench(gemv_alloc_reducer(N, K, block_M=128, block_N=128), N, K, do_bench=do_bench) print("Test passed!") - if not do_bench: + if do_bench: best_result = get_autotuned_kernel(N, K) best_config = best_result.config kernel = splitk_gemv_vectorized_tvm(N, K, **best_config) @@ -374,5 +364,23 @@ def main(do_bench: bool = True): print(f"TileLang BlockReduce Latency: {tilelang_tile_latency} ms\n") +def run_regression_perf(): + N, K = 4096, 4096 + latency = 0.0 + kernel_list = [ + naive_gemv(N, K, 128, 128), + naive_splitk_gemv(N, K, 32, 32), + splitk_gemv(N, K, 32, 32, 32), + splitk_gemv_vectorized(N, K, 2, 32), + splitk_gemv_vectorized_tvm(N, K, 2, 32), + gemv_alloc_reducer(N, K, block_M=128, block_N=128), + ] + for kernel in kernel_list: + profiler = kernel.get_profiler() + # Benchmark the TileLang kernel itself, not the PyTorch reference. 
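+        # Per-kernel latencies are summed here and averaged over the kernel list on return.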
+ latency += profiler.do_bench(backend="cupti") + return latency / len(kernel_list) + + if __name__ == "__main__": main() diff --git a/examples/gemv/regression_example_gemv.py b/examples/gemv/regression_example_gemv.py new file mode 100644 index 000000000..dd6f1d39f --- /dev/null +++ b/examples/gemv/regression_example_gemv.py @@ -0,0 +1,10 @@ +import tilelang.testing +import example_gemv + + +def regression_example_gemv(): + tilelang.testing.process_func(example_gemv.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/gemv/test_example_gemv.py b/examples/gemv/test_example_gemv.py index 3881ca769..323337a7a 100644 --- a/examples/gemv/test_example_gemv.py +++ b/examples/gemv/test_example_gemv.py @@ -1,5 +1,3 @@ -import tilelang.testing - import example_gemv @@ -8,4 +6,4 @@ def test_example_gemv(): if __name__ == "__main__": - tilelang.testing.main() + test_example_gemv() diff --git a/examples/grouped_gemm/example_grouped_gemm_bwd.py b/examples/grouped_gemm/example_grouped_gemm_bwd.py index ac8da7e2c..49cce0d1d 100644 --- a/examples/grouped_gemm/example_grouped_gemm_bwd.py +++ b/examples/grouped_gemm/example_grouped_gemm_bwd.py @@ -5,78 +5,55 @@ import tilelang.language as T -@tilelang.jit( - out_idx=[2], pass_configs={ - "tl.disable_tma_lower": True, - "tl.disable_warp_specialized": True - }) -def grouped_gemm_fwd(batch_sum, - batch_count, - K, - N, - block_M, - block_N, - block_K, - num_stages=2, - threads=128, - dtype="float16"): +@tilelang.jit(out_idx=[2], pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) +def grouped_gemm_fwd(batch_sum, batch_count, K, N, block_M, block_N, block_K, num_stages=2, threads=128, dtype=T.float16): """ args: a (torch.Tensor): Input tensor of shape (M, K). b (torch.Tensor): Input tensor of shape (G, K, N). 
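+        batch_sizes / batch_offsets / batch_padded_offsets (torch.Tensor): per-group row counts of a and their cumulative (unpadded and padded) row offsets.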
""" - accum_dtype = "float32" + accum_dtype = T.float32 @T.prim_func def kernel( - A: T.Tensor([batch_sum, K], dtype), # type: ignore - B: T.Tensor([batch_count, K, N], dtype), # type: ignore - C: T.Tensor([batch_sum, N], dtype), # type: ignore - batch_sizes: T.Tensor([batch_count], "int32"), # type: ignore - batch_offsets: T.Tensor([batch_count], "int32"), # type: ignore - batch_padded_offsets: T.Tensor([batch_count], "int32"), # type: ignore + A: T.Tensor([batch_sum, K], dtype), # type: ignore + B: T.Tensor([batch_count, K, N], dtype), # type: ignore + C: T.Tensor([batch_sum, N], dtype), # type: ignore + batch_sizes: T.Tensor([batch_count], T.int32), # type: ignore + batch_offsets: T.Tensor([batch_count], T.int32), # type: ignore + batch_padded_offsets: T.Tensor([batch_count], T.int32), # type: ignore ): - - with T.Kernel( - T.ceildiv(batch_sum, block_M) + batch_count, T.ceildiv(N, block_N), - threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(batch_sum, block_M) + batch_count, T.ceildiv(N, block_N), threads=threads) as (bx, by): A_shared = T.alloc_shared([block_M, block_K], dtype) B_shared = T.alloc_shared([block_K, block_N], dtype) C_local = T.alloc_fragment([block_M, block_N], accum_dtype) - cur_batch_idx = T.alloc_local([1], "int32") - cur_batch_size = T.alloc_local([1], "int32") + cur_batch_idx = T.alloc_var(dtype=T.int32) + cur_batch_size = T.alloc_var(dtype=T.int32) m_start_padded = bx * block_M for i in range(batch_count): - in_cur_batch_idx = (m_start_padded >= batch_padded_offsets[i]) - cur_batch_idx[0] = T.if_then_else(in_cur_batch_idx, i, cur_batch_idx[0]) + in_cur_batch_idx = m_start_padded >= batch_padded_offsets[i] + cur_batch_idx = T.if_then_else(in_cur_batch_idx, i, cur_batch_idx) - cur_batch_size[0] = batch_sizes[cur_batch_idx[0]] - m_start = m_start_padded - batch_padded_offsets[cur_batch_idx[0]] + batch_offsets[ - cur_batch_idx[0]] - actual_rows = T.max( - 0, - T.min(block_M, - cur_batch_size[0] + batch_padded_offsets[cur_batch_idx[0]] - m_start_padded)) + cur_batch_size = batch_sizes[cur_batch_idx] + m_start = m_start_padded - batch_padded_offsets[cur_batch_idx] + batch_offsets[cur_batch_idx] + actual_rows = T.max(0, T.min(block_M, cur_batch_size + batch_padded_offsets[cur_batch_idx] - m_start_padded)) T.clear(C_local) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): - T.copy(A[m_start:m_start + block_M, k * block_K:(k + 1) * block_K], A_shared) - T.copy( - B[cur_batch_idx[0], k * block_K:(k + 1) * block_K, - by * block_N:(by + 1) * block_N], B_shared) + T.copy(A[m_start : m_start + block_M, k * block_K : (k + 1) * block_K], A_shared) + T.copy(B[cur_batch_idx, k * block_K : (k + 1) * block_K, by * block_N : (by + 1) * block_N], B_shared) T.gemm(A_shared, B_shared, C_local) for i, j in T.Parallel(block_M, block_N): - with T.If(i < actual_rows), T.Then(): + if i < actual_rows: C[m_start + i, by * block_N + j] = C_local[i, j] return kernel class _GroupedGEMM(torch.autograd.Function): - @staticmethod def forward(ctx, a, b, batch_sizes): block_M = 64 @@ -99,15 +76,11 @@ def forward(ctx, a, b, batch_sizes): for i in range(batch_count - 1): batch_offsets_list.append(batch_offsets_list[-1] + batch_sizes[i]) for i in range(batch_count - 1): - batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + - math.ceil((batch_sizes[i] + 1) / padding_M) * - padding_M) + batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + math.ceil((batch_sizes[i] + 1) / padding_M) * padding_M) batch_offsets = torch.tensor(batch_offsets_list, device=a.device, 
dtype=torch.int32) - batch_padded_offsets = torch.tensor( - batch_padded_offsets_list, device=a.device, dtype=torch.int32) + batch_padded_offsets = torch.tensor(batch_padded_offsets_list, device=a.device, dtype=torch.int32) - kernel = grouped_gemm_fwd(batch_sum, batch_count, K, N, block_M, block_N, block_K, - num_stages, threads) + kernel = grouped_gemm_fwd(batch_sum, batch_count, K, N, block_M, block_N, block_K, num_stages, threads) o = kernel(a, b, batch_sizes, batch_offsets, batch_padded_offsets) ctx.save_for_backward(a, b, batch_sizes, batch_offsets) @@ -135,8 +108,7 @@ def maybe_contiguous(x): return x A, B, batch_sizes = [maybe_contiguous(x) for x in (A, B, batch_sizes)] - kernel = grouped_gemm_bwd(ctx.batch_sum, ctx.batch_count, M, N, block_M, block_N, block_K, - num_stages, threads) + kernel = grouped_gemm_bwd(ctx.batch_sum, ctx.batch_count, M, N, block_M, block_N, block_K, num_stages, threads) dB = kernel(A, grad_output, batch_sizes, batch_offsets) return None, dB, None @@ -172,9 +144,7 @@ def construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype): for i in range(batch_count - 1): batch_offsets_list.append(batch_offsets_list[-1] + batch_sizes_list[i]) for i in range(batch_count - 1): - batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + - math.ceil((batch_sizes_list[i] + 1) / padding_M) * - padding_M) + batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + math.ceil((batch_sizes_list[i] + 1) / padding_M) * padding_M) A = torch.randn(batch_sum, K, device=device, dtype=dtype) B = torch.randn(batch_count, K, M, device=device, dtype=dtype) C = torch.empty(batch_sum, M, device=device, dtype=dtype) @@ -187,40 +157,24 @@ def construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype): return A, B, C, batch_sizes, batch_offsets, batch_padded_offsets -@tilelang.jit( - out_idx=[2], pass_configs={ - "tl.disable_tma_lower": True, - "tl.disable_warp_specialized": True - }) -def grouped_gemm_bwd(batch_sum, - batch_count, - M, - N, - block_M, - block_N, - block_K, - num_stages=2, - threads=128, - dtype="float16"): +@tilelang.jit(out_idx=[2], pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) +def grouped_gemm_bwd(batch_sum, batch_count, M, N, block_M, block_N, block_K, num_stages=2, threads=128, dtype=T.float16): """ args: a (torch.Tensor): Input tensor of shape (M, K). b (torch.Tensor): Input tensor of shape (G, K, N). 
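+        batch_sizes / batch_offsets (torch.Tensor): per-group row counts and cumulative row offsets; out-of-range rows are zero-filled when loading each K-block.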
""" - accum_dtype = "float32" + accum_dtype = T.float32 @T.prim_func def kernel( - A: T.Tensor([batch_sum, M], dtype), # type: ignore - B: T.Tensor([batch_sum, N], dtype), # type: ignore - C: T.Tensor([batch_count, M, N], dtype), # type: ignore - batch_sizes: T.Tensor([batch_count], "int32"), # type: ignore - batch_offsets: T.Tensor([batch_count], "int32"), # type: ignore + A: T.Tensor([batch_sum, M], dtype), # type: ignore + B: T.Tensor([batch_sum, N], dtype), # type: ignore + C: T.Tensor([batch_count, M, N], dtype), # type: ignore + batch_sizes: T.Tensor([batch_count], T.int32), # type: ignore + batch_offsets: T.Tensor([batch_count], T.int32), # type: ignore ): - - with T.Kernel( - T.ceildiv(M, block_M), T.ceildiv(N, block_N), batch_count, - threads=threads) as (bx, by, bz): + with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), batch_count, threads=threads) as (bx, by, bz): A_shared = T.alloc_shared([block_K, block_M], dtype) B_shared = T.alloc_shared([block_K, block_N], dtype) C_local = T.alloc_fragment([block_M, block_N], accum_dtype) @@ -228,13 +182,9 @@ def kernel( T.clear(C_local) for k in T.Pipelined(T.ceildiv(batch_sizes[bz], block_K), num_stages=num_stages): for i, j in T.Parallel(block_K, block_M): - A_shared[i, j] = T.if_then_else( - i < batch_sizes[bz], A[batch_offsets[bz] + k * block_K + i, - bx * block_M + j], 0) + A_shared[i, j] = T.if_then_else(i < batch_sizes[bz], A[batch_offsets[bz] + k * block_K + i, bx * block_M + j], 0) for i, j in T.Parallel(block_K, block_N): - B_shared[i, j] = T.if_then_else( - i < batch_sizes[bz], B[batch_offsets[bz] + k * block_K + i, - by * block_N + j], 0) + B_shared[i, j] = T.if_then_else(i < batch_sizes[bz], B[batch_offsets[bz] + k * block_K + i, by * block_N + j], 0) T.gemm(A_shared, B_shared, C_local, transpose_A=True) T.copy(C_local, C[bz, bx * block_M, by * block_N]) @@ -242,23 +192,12 @@ def kernel( return kernel -def run_tilelang_grouped_gemm(batch_sizes_list, - K, - M, - block_M, - block_N, - block_K, - trans_b, - num_stages=2, - threads=128, - profile=False): - +def run_tilelang_grouped_gemm(batch_sizes_list, K, M, block_M, block_N, block_K, trans_b, num_stages=2, threads=128, profile=False): padding_M = block_M device = torch.device("cuda") dtype = torch.float16 - A, B, C, batch_sizes, batch_offsets, batch_padded_offsets = construct_inputs( - batch_sizes_list, K, M, False, padding_M, device, dtype) + A, B, C, batch_sizes, batch_offsets, batch_padded_offsets = construct_inputs(batch_sizes_list, K, M, False, padding_M, device, dtype) A.requires_grad_(False) B.requires_grad_(True) @@ -273,10 +212,7 @@ def run_tilelang_grouped_gemm(batch_sizes_list, O.backward(dO, retain_graph=True) dB, B.grad = B.grad.clone(), None - if ( - torch.allclose(O, O_ref, rtol=1e-2, atol=1e-2) and \ - torch.allclose(dB, dB_ref, rtol=1e-2, atol=1e-2) - ): + if torch.allclose(O, O_ref, rtol=1e-2, atol=1e-2) and torch.allclose(dB, dB_ref, rtol=1e-2, atol=1e-2): print("✅ Tilelang and Torch match") else: print("❌ Tilelang and Torch mismatch") @@ -284,12 +220,11 @@ def run_tilelang_grouped_gemm(batch_sizes_list, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--batch_sizes', type=str, default="64, 128", help='comma-separated batch sizes') - parser.add_argument('--K', type=int, default=8192, help='reduce dim') - parser.add_argument('--M', type=int, default=8192, help='output dim') - parser.add_argument('--trans_b', action="store_true", help="transpose B") - parser.add_argument('--profile', action="store_true", 
help="profile") + parser.add_argument("--batch_sizes", type=str, default="64, 128", help="comma-separated batch sizes") + parser.add_argument("--K", type=int, default=8192, help="reduce dim") + parser.add_argument("--M", type=int, default=8192, help="output dim") + parser.add_argument("--trans_b", action="store_true", help="transpose B") + parser.add_argument("--profile", action="store_true", help="profile") args = parser.parse_args() batch_sizes_list = [int(x) for x in args.batch_sizes.split(",")] @@ -301,14 +236,4 @@ def run_tilelang_grouped_gemm(batch_sizes_list, num_stages = 2 threads = 256 - run_tilelang_grouped_gemm( - batch_sizes_list, - K, - M, - block_M, - block_N, - block_K, - trans_b, - num_stages, - threads, - profile=args.profile) + run_tilelang_grouped_gemm(batch_sizes_list, K, M, block_M, block_N, block_K, trans_b, num_stages, threads, profile=args.profile) diff --git a/examples/grouped_gemm/example_grouped_gemm_fwd.py b/examples/grouped_gemm/example_grouped_gemm_fwd.py index 9b58e3a21..b71472741 100644 --- a/examples/grouped_gemm/example_grouped_gemm_fwd.py +++ b/examples/grouped_gemm/example_grouped_gemm_fwd.py @@ -18,8 +18,7 @@ def torch_gmm(a, b, batch_sizes, batch_offsets_tensor, trans_b=False): torch.Tensor: Resulting tensor after grouped matrix multiplication. """ assert a.shape[0] == sum(batch_sizes), "Sum of batch_sizes must equal the first dimension of a" - assert b.shape[0] == len( - batch_sizes), "The first dimension of b must match the length of batch_sizes" + assert b.shape[0] == len(batch_sizes), "The first dimension of b must match the length of batch_sizes" # Initialize output tensor output = torch.empty((sum(batch_sizes), b.shape[2]), device=a.device, dtype=a.dtype) @@ -38,15 +37,7 @@ def torch_gmm(a, b, batch_sizes, batch_offsets_tensor, trans_b=False): @tilelang.jit(out_idx=[2]) -def grouped_gemm(batch_sizes_list, - K, - N, - block_M, - block_N, - block_K, - num_stages=2, - threads=128, - dtype="float16"): +def grouped_gemm(batch_sizes_list, K, N, block_M, block_N, block_K, num_stages=2, threads=128, dtype=T.float16): """ args: a (torch.Tensor): Input tensor of shape (M, K). 
@@ -54,50 +45,43 @@ def grouped_gemm(batch_sizes_list, """ batch_sum = sum(batch_sizes_list) batch_count = len(batch_sizes_list) - accum_dtype = "float32" + accum_dtype = T.float32 total_m_blocks = sum((size + block_M - 1) // block_M for size in batch_sizes_list) @T.prim_func def kernel( - A: T.Tensor([batch_sum, K], dtype), # type: ignore - B: T.Tensor([batch_count, K, N], dtype), # type: ignore - C: T.Tensor([batch_sum, N], dtype), # type: ignore - batch_sizes: T.Tensor([batch_count], "int32"), # type: ignore - batch_offsets: T.Tensor([batch_count], "int32"), # type: ignore - batch_padded_offsets: T.Tensor([batch_count], "int32"), # type: ignore + A: T.Tensor([batch_sum, K], dtype), # type: ignore + B: T.Tensor([batch_count, K, N], dtype), # type: ignore + C: T.Tensor([batch_sum, N], dtype), # type: ignore + batch_sizes: T.Tensor([batch_count], T.int32), # type: ignore + batch_offsets: T.Tensor([batch_count], T.int32), # type: ignore + batch_padded_offsets: T.Tensor([batch_count], T.int32), # type: ignore ): - with T.Kernel(total_m_blocks, T.ceildiv(N, block_N), threads=threads) as (bx, by): A_shared = T.alloc_shared([block_M, block_K], dtype) B_shared = T.alloc_shared([block_K, block_N], dtype) C_local = T.alloc_fragment([block_M, block_N], accum_dtype) - cur_batch_idx = T.alloc_local([1], "int32") - cur_batch_size = T.alloc_local([1], "int32") + cur_batch_idx = T.alloc_var(dtype=T.int32) + cur_batch_size = T.alloc_var(dtype=T.int32) m_start_padded = bx * block_M for i in range(batch_count): - in_cur_batch_idx = (m_start_padded >= batch_padded_offsets[i]) - cur_batch_idx[0] = T.if_then_else(in_cur_batch_idx, i, cur_batch_idx[0]) + in_cur_batch_idx = m_start_padded >= batch_padded_offsets[i] + cur_batch_idx = T.if_then_else(in_cur_batch_idx, i, cur_batch_idx) - cur_batch_size[0] = batch_sizes[cur_batch_idx[0]] - m_start = m_start_padded - batch_padded_offsets[cur_batch_idx[0]] + batch_offsets[ - cur_batch_idx[0]] - actual_rows = T.max( - 0, - T.min(block_M, - cur_batch_size[0] + batch_padded_offsets[cur_batch_idx[0]] - m_start_padded)) + cur_batch_size = batch_sizes[cur_batch_idx] + m_start = m_start_padded - batch_padded_offsets[cur_batch_idx] + batch_offsets[cur_batch_idx] + actual_rows = T.max(0, T.min(block_M, cur_batch_size + batch_padded_offsets[cur_batch_idx] - m_start_padded)) T.clear(C_local) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): - T.copy(A[m_start:m_start + block_M, k * block_K:(k + 1) * block_K], A_shared) - T.copy( - B[cur_batch_idx[0], k * block_K:(k + 1) * block_K, - by * block_N:(by + 1) * block_N], B_shared) + T.copy(A[m_start : m_start + block_M, k * block_K : (k + 1) * block_K], A_shared) + T.copy(B[cur_batch_idx, k * block_K : (k + 1) * block_K, by * block_N : (by + 1) * block_N], B_shared) T.gemm(A_shared, B_shared, C_local) for i, j in T.Parallel(block_M, block_N): - with T.If(i < actual_rows), T.Then(): + if i < actual_rows: C[m_start + i, by * block_N + j] = C_local[i, j] return kernel @@ -111,8 +95,7 @@ def construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype): for i in range(batch_count - 1): batch_offsets_list.append(batch_offsets_list[-1] + batch_sizes_list[i]) for i in range(batch_count - 1): - batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + - math.ceil((batch_sizes_list[i]) / padding_M) * padding_M) + batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + math.ceil((batch_sizes_list[i]) / padding_M) * padding_M) A = torch.randn(batch_sum, K, device=device, dtype=dtype) B = 
torch.randn(batch_count, K, M, device=device, dtype=dtype) C = torch.empty(batch_sum, M, device=device, dtype=dtype) @@ -125,27 +108,16 @@ def construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype): return A, B, C, batch_sizes, batch_offsets, batch_padded_offsets -def run_tilelang_grouped_gemm(batch_sizes_list, - K, - M, - block_M, - block_N, - block_K, - trans_b, - num_stages=2, - threads=128, - profile=False): +def run_tilelang_grouped_gemm(batch_sizes_list, K, M, block_M, block_N, block_K, trans_b, num_stages=2, threads=128, profile=False): padding_M = block_M batch_sum = sum(batch_sizes_list) - kernel = grouped_gemm( - tuple(batch_sizes_list), K, M, block_M, block_N, block_K, num_stages, threads) + kernel = grouped_gemm(tuple(batch_sizes_list), K, M, block_M, block_N, block_K, num_stages, threads) # print(kernel.get_kernel_source()) device = torch.device("cuda") dtype = torch.float16 - A, B, C, batch_sizes, batch_offsets, batch_padded_offsets = construct_inputs( - batch_sizes_list, K, M, trans_b, padding_M, device, dtype) + A, B, C, batch_sizes, batch_offsets, batch_padded_offsets = construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype) out = kernel(A, B, batch_sizes, batch_offsets, batch_padded_offsets) ref_output = torch_gmm(A, B, batch_sizes, batch_offsets, trans_b) # print(out) @@ -157,8 +129,7 @@ def run_tilelang_grouped_gemm(batch_sizes_list, if profile: profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) - latency = profiler.do_bench( - warmup=500, input_tensors=[A, B, batch_sizes, batch_offsets, batch_padded_offsets]) + latency = profiler.do_bench(warmup=500, input_tensors=[A, B, batch_sizes, batch_offsets, batch_padded_offsets]) print(f"Latency: {latency} ms") print(f"TFlops: {batch_sum * K * M * 2 / latency * 1e-9} TFlops") @@ -173,12 +144,11 @@ def test_grouped_gemm(): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--batch_sizes', type=str, default="64, 128", help='comma-separated batch sizes') - parser.add_argument('--K', type=int, default=8192, help='reduce dim') - parser.add_argument('--M', type=int, default=8192, help='output dim') - parser.add_argument('--trans_b', action="store_true", help="transpose B") - parser.add_argument('--profile', action="store_true", help="profile") + parser.add_argument("--batch_sizes", type=str, default="64, 128", help="comma-separated batch sizes") + parser.add_argument("--K", type=int, default=8192, help="reduce dim") + parser.add_argument("--M", type=int, default=8192, help="output dim") + parser.add_argument("--trans_b", action="store_true", help="transpose B") + parser.add_argument("--profile", action="store_true", help="profile") args = parser.parse_args() batch_sizes_list = [int(x) for x in args.batch_sizes.split(",")] @@ -190,14 +160,4 @@ def test_grouped_gemm(): num_stages = 2 threads = 256 - run_tilelang_grouped_gemm( - batch_sizes_list, - K, - M, - block_M, - block_N, - block_K, - trans_b, - num_stages, - threads, - profile=args.profile) + run_tilelang_grouped_gemm(batch_sizes_list, K, M, block_M, block_N, block_K, trans_b, num_stages, threads, profile=args.profile) diff --git a/examples/hadamard_transform/example_hadamard.py b/examples/hadamard_transform/example_hadamard.py index 531d46891..65f463b71 100644 --- a/examples/hadamard_transform/example_hadamard.py +++ b/examples/hadamard_transform/example_hadamard.py @@ -17,7 +17,7 @@ def is_pow_of_2(n): def hadamard(b, n, dtype): assert is_pow_of_2(n), "n must be a power 
of 2" assert 2 <= n <= 32768, "n must be in [2, 32768]" - elem_size = {'float32': 4, 'float16': 2, 'bfloat16': 2}[dtype] + elem_size = {T.float32: 4, T.float16: 2, T.bfloat16: 2}[dtype] logN = int(math.log2(n)) threads = [0, 1, 1, 1, 2, 4, 8, 16, 32, 32, 128, 256, 256, 256, 256, 256][logN] @@ -40,23 +40,21 @@ def hadamard(b, n, dtype): # print(f'{exchange_round=}') @T.macro - def warp_shfl(local: T.Tensor((thread_elem,), dtype), buf: T.Tensor((thread_elem,), dtype), - round: int): + def warp_shfl(local: T.Tensor((thread_elem,), dtype), buf: T.Tensor((thread_elem,), dtype), round: int): tx = T.get_thread_binding(0) for i in T.serial(round): tx_stride = 1 << i another_tx = tx ^ tx_stride - sign = ( - tx >> i - ) & 1 # get i-th lowest bit of tx, which determines the operation type for shared[tx, :] + sign = (tx >> i) & 1 # get i-th lowest bit of tx, which determines the operation type for shared[tx, :] for j in T.Pipelined(thread_elem, num_stages=1): buf[j] = T.tvm_warp_shuffle( - 0xffffffff, # mask of all threads + 0xFFFFFFFF, # mask of all threads local[j], another_tx % warp_size, warp_size, - warp_size) + warp_size, + ) local[j] = T.if_then_else(sign == 0, local[j] + buf[j], buf[j] - local[j]) @T.prim_func @@ -78,10 +76,8 @@ def main(A: T.Tensor((b, n), dtype), B: T.Tensor((b, n), dtype)): for j in T.serial(chunknum): chunkbase = j * chunksize for k in T.serial(chunksize // 2): - local[chunkbase + - k] = local[chunkbase + k] + local[chunkbase + k + chunksize // 2] - local[chunkbase + k + chunksize // - 2] = local[chunkbase + k] - 2 * local[chunkbase + k + chunksize // 2] + local[chunkbase + k] = local[chunkbase + k] + local[chunkbase + k + chunksize // 2] + local[chunkbase + k + chunksize // 2] = local[chunkbase + k] - 2 * local[chunkbase + k + chunksize // 2] # 3. 
Hadamard inside warp, n<=512 # In warp level, we rely on warp shuffle to exchange data inside each warp, without using shared memory @@ -131,28 +127,27 @@ def ref_program(x: torch.Tensor): assert x.ndim == 2 dim = x.shape[-1] assert is_pow_of_2(dim) - return F.linear( - x, torch.tensor(scipy.linalg.hadamard(dim, dtype=float), dtype=x.dtype, device=x.device)) + return F.linear(x, torch.tensor(scipy.linalg.hadamard(dim, dtype=float), dtype=x.dtype, device=x.device)) def main(): parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=64, help='Batch size') - parser.add_argument('--dim', type=int, default=32768, help='Dimension') + parser.add_argument("--batch", type=int, default=64, help="Batch size") + parser.add_argument("--dim", type=int, default=32768, help="Dimension") args = parser.parse_args() B, D = args.batch, args.dim - x = torch.randn((B, D), device='cuda') - kernel = hadamard(B, D, 'float32') + x = torch.randn((B, D), device="cuda") + kernel = hadamard(B, D, T.float32) y = kernel(x) y_ref = ref_program(x) torch.testing.assert_close(y, y_ref, atol=1e-2, rtol=1e-2) - print('All tests passed.') + print("All tests passed.") profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) latency = profiler.do_bench(warmup=100) print("Tile-lang: {:.2f} ms".format(latency)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/kda/FLA_KDA/cumsum.py b/examples/kda/FLA_KDA/cumsum.py new file mode 100644 index 000000000..0fb3368f6 --- /dev/null +++ b/examples/kda/FLA_KDA/cumsum.py @@ -0,0 +1,469 @@ +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + + +import torch +import triton +import triton.language as tl + +from .fla_utils import prepare_chunk_indices, autotune_cache_kwargs, input_guard + +BS_LIST = [32, 64] + + +@triton.heuristics( + { + "HAS_SCALE": lambda args: args["scale"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]], + key=["B", "H", "BT", "IS_VARLEN", "REVERSE"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_local_cumsum_scalar_kernel( + s, + o, + scale, + cu_seqlens, + chunk_indices, + T, + B: tl.constexpr, + H: tl.constexpr, + BT: tl.constexpr, + REVERSE: tl.constexpr, + HAS_SCALE: tl.constexpr, + IS_VARLEN: tl.constexpr, + HEAD_FIRST: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + if HEAD_FIRST: + p_s = tl.make_block_ptr(s + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)) + p_o = tl.make_block_ptr(o + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)) + else: + p_s = tl.make_block_ptr(s + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + p_o = tl.make_block_ptr(o + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + # [BT] + b_s = tl.load(p_s, boundary_check=(0,)).to(tl.float32) + b_o = tl.cumsum(b_s, axis=0) + if REVERSE: + b_z = tl.sum(b_s, axis=0) + b_o = -b_o + b_z[None] + b_s + if HAS_SCALE: + b_o *= scale + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0,)) + + +@triton.heuristics( + { + "HAS_SCALE": lambda args: args["scale"] is not None, + "IS_VARLEN": 
lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[triton.Config({"BS": BS}, num_warps=num_warps) for BS in BS_LIST for num_warps in [2, 4, 8]], + key=["B", "H", "S", "BT", "IS_VARLEN", "REVERSE"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_local_cumsum_vector_kernel( + s, + o, + scale, + cu_seqlens, + chunk_indices, + T, + B: tl.constexpr, + H: tl.constexpr, + S: tl.constexpr, + BT: tl.constexpr, + BS: tl.constexpr, + REVERSE: tl.constexpr, + HAS_SCALE: tl.constexpr, + IS_VARLEN: tl.constexpr, + HEAD_FIRST: tl.constexpr, +): + i_s, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + if HEAD_FIRST: + p_s = tl.make_block_ptr(s + (bos * H + i_h * T) * S, (T, S), (S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + p_o = tl.make_block_ptr(o + (bos * H + i_h * T) * S, (T, S), (S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + else: + p_s = tl.make_block_ptr(s + (bos * H + i_h) * S, (T, S), (H * S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + p_o = tl.make_block_ptr(o + (bos * H + i_h) * S, (T, S), (H * S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + # [BT, BS] + b_s = tl.load(p_s, boundary_check=(0, 1)).to(tl.float32) + if REVERSE: + b_o = tl.cumsum(b_s, axis=0, reverse=True) + else: + b_o = tl.cumsum(b_s, axis=0) + if HAS_SCALE: + b_o *= scale + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics( + { + "HAS_SCALE": lambda args: args["scale"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BT": BT}, num_warps=num_warps, num_stages=num_stages) + for BT in [32, 64, 128, 256] + for num_warps in [2, 4, 8] + for num_stages in [1, 2, 3, 4] + ], + key=["B", "H", "IS_VARLEN", "REVERSE"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_global_cumsum_scalar_kernel( + s, + o, + scale, + cu_seqlens, + T, + B: tl.constexpr, + H: tl.constexpr, + BT: tl.constexpr, + REVERSE: tl.constexpr, + HAS_SCALE: tl.constexpr, + IS_VARLEN: tl.constexpr, + HEAD_FIRST: tl.constexpr, +): + i_nh = tl.program_id(0) + i_n, i_h = i_nh // H, i_nh % H + if IS_VARLEN: + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + else: + bos, eos = i_n * T, i_n * T + T + T = eos - bos + + b_z = tl.zeros([], dtype=tl.float32) + NT = tl.cdiv(T, BT) + for i_c in range(NT): + i_t = NT - 1 - i_c if REVERSE else i_c + if HEAD_FIRST: + p_s = tl.make_block_ptr(s + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)) + p_o = tl.make_block_ptr(o + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)) + else: + p_s = tl.make_block_ptr(s + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + p_o = tl.make_block_ptr(o + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + b_s = tl.load(p_s, boundary_check=(0,)).to(tl.float32) + b_o = tl.cumsum(b_s, axis=0) + b_ss = tl.sum(b_s, 0) + if REVERSE: + b_o = -b_o + b_ss + b_s + b_o += b_z + if i_c >= 0: + b_z += b_ss + if HAS_SCALE: + b_o *= scale + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0,)) + + +@triton.heuristics( + { + "HAS_SCALE": lambda args: args["scale"] is not None, + "IS_VARLEN": lambda 
args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BT": BT}, num_warps=num_warps, num_stages=num_stages) + for BT in [16, 32, 64, 128] + for num_warps in [2, 4, 8] + for num_stages in [1, 2, 3, 4] + ], + key=["B", "H", "S", "IS_VARLEN", "REVERSE"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_global_cumsum_vector_kernel( + s, + o, + scale, + cu_seqlens, + T, + B: tl.constexpr, + H: tl.constexpr, + S: tl.constexpr, + BT: tl.constexpr, + BS: tl.constexpr, + REVERSE: tl.constexpr, + HAS_SCALE: tl.constexpr, + IS_VARLEN: tl.constexpr, + HEAD_FIRST: tl.constexpr, +): + i_s, i_nh = tl.program_id(0), tl.program_id(1) + i_n, i_h = i_nh // H, i_nh % H + if IS_VARLEN: + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + else: + bos, eos = i_n * T, i_n * T + T + T = eos - bos + + b_z = tl.zeros([BS], dtype=tl.float32) + NT = tl.cdiv(T, BT) + for i_c in range(NT): + i_t = NT - 1 - i_c if REVERSE else i_c + if HEAD_FIRST: + p_s = tl.make_block_ptr(s + (bos * H + i_h * T) * S, (T, S), (S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + p_o = tl.make_block_ptr(o + (bos * H + i_h * T) * S, (T, S), (S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + else: + p_s = tl.make_block_ptr(s + (bos * H + i_h) * S, (T, S), (H * S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + p_o = tl.make_block_ptr(o + (bos * H + i_h) * S, (T, S), (H * S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + # [BT, BS] + b_s = tl.load(p_s, boundary_check=(0, 1)).to(tl.float32) + if REVERSE: + b_c = b_z[None, :] + tl.cumsum(b_s, axis=0, reverse=True) + else: + b_c = b_z[None, :] + tl.cumsum(b_s, axis=0) + if HAS_SCALE: + b_c *= scale + tl.store(p_o, b_c.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + b_z += tl.sum(b_s, 0) + + +def chunk_local_cumsum_scalar( + g: torch.Tensor, + chunk_size: int, + reverse: bool = False, + scale: float = None, + cu_seqlens: torch.Tensor = None, + head_first: bool = False, + output_dtype: torch.dtype = torch.float, + chunk_indices: torch.LongTensor = None, +) -> torch.Tensor: + if head_first: + B, H, T = g.shape + else: + B, T, H = g.shape + assert chunk_size == 2 ** (chunk_size.bit_length() - 1), "chunk_size must be a power of 2" + BT = chunk_size + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, BT) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype) + grid = (NT, B * H) + chunk_local_cumsum_scalar_kernel[grid]( + s=g_org, + o=g, + scale=scale, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + B=B, + H=H, + BT=BT, + HEAD_FIRST=head_first, + REVERSE=reverse, + ) + return g + + +def chunk_local_cumsum_vector( + g: torch.Tensor, + chunk_size: int, + reverse: bool = False, + scale: float = None, + cu_seqlens: torch.Tensor = None, + head_first: bool = False, + output_dtype: torch.dtype = torch.float, + chunk_indices: torch.LongTensor = None, +) -> torch.Tensor: + if head_first: + B, H, T, S = g.shape + else: + B, T, H, S = g.shape + BT = chunk_size + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, BT) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + assert chunk_size == 2 ** (chunk_size.bit_length() - 1), "chunk_size must be a power of 2" + + g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype) + + def grid(meta): + return (triton.cdiv(meta["S"], 
meta["BS"]), NT, B * H) + + # keep cumulative normalizer in fp32 + # this kernel is equivalent to + # g = g.view(B, H, NT, BT, -1).cumsum(-2).view(B, H, T, -1) + chunk_local_cumsum_vector_kernel[grid]( + s=g_org, + o=g, + scale=scale, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + B=B, + H=H, + S=S, + BT=BT, + HEAD_FIRST=head_first, + REVERSE=reverse, + ) + return g + + +@input_guard +def chunk_global_cumsum_scalar( + s: torch.Tensor, + reverse: bool = False, + cu_seqlens: torch.Tensor = None, + scale: float = None, + head_first: bool = False, + output_dtype: torch.dtype = torch.float, +) -> torch.Tensor: + if head_first: + B, H, T = s.shape + else: + B, T, H = s.shape + N = len(cu_seqlens) - 1 if cu_seqlens is not None else B + + z = torch.empty_like(s, dtype=output_dtype or s.dtype) + grid = (N * H,) + chunk_global_cumsum_scalar_kernel[grid]( + s=s, + o=z, + scale=scale, + cu_seqlens=cu_seqlens, + T=T, + B=B, + H=H, + HEAD_FIRST=head_first, + REVERSE=reverse, + ) + return z + + +@input_guard +def chunk_global_cumsum_vector( + s: torch.Tensor, + reverse: bool = False, + cu_seqlens: torch.Tensor = None, + scale: float = None, + head_first: bool = False, + output_dtype: torch.dtype = torch.float, +) -> torch.Tensor: + if head_first: + B, H, T, S = s.shape + else: + B, T, H, S = s.shape + N = len(cu_seqlens) - 1 if cu_seqlens is not None else B + BS = min(32, triton.next_power_of_2(S)) + + z = torch.empty_like(s, dtype=output_dtype or s.dtype) + grid = (triton.cdiv(S, BS), N * H) + chunk_global_cumsum_vector_kernel[grid]( + s=s, + o=z, + scale=scale, + cu_seqlens=cu_seqlens, + T=T, + B=B, + H=H, + S=S, + BS=BS, + HEAD_FIRST=head_first, + REVERSE=reverse, + ) + return z + + +@input_guard +def chunk_global_cumsum( + s: torch.Tensor, + reverse: bool = False, + cu_seqlens: torch.Tensor = None, + scale: float = None, + head_first: bool = False, + output_dtype: torch.dtype = torch.float, +) -> torch.Tensor: + if cu_seqlens is not None: + assert s.shape[0] == 1, "Only batch size 1 is supported when cu_seqlens are provided" + if len(s.shape) == 3: + return chunk_global_cumsum_scalar( + s=s, + reverse=reverse, + cu_seqlens=cu_seqlens, + scale=scale, + head_first=head_first, + output_dtype=output_dtype, + ) + elif len(s.shape) == 4: + return chunk_global_cumsum_vector( + s=s, + reverse=reverse, + cu_seqlens=cu_seqlens, + scale=scale, + head_first=head_first, + output_dtype=output_dtype, + ) + else: + raise ValueError( + f"Unsupported input shape {s.shape}, " + f"which should be [B, T, H]/[B, T, H, D] if `head_first=False` " + f"or [B, H, T]/[B, H, T, D] otherwise", + ) + + +@input_guard +def chunk_local_cumsum( + g: torch.Tensor, + chunk_size: int, + reverse: bool = False, + scale: float = None, + cu_seqlens: torch.Tensor = None, + head_first: bool = False, + output_dtype: torch.dtype = torch.float, + chunk_indices: torch.LongTensor = None, + **kwargs, +) -> torch.Tensor: + if cu_seqlens is not None: + assert g.shape[0] == 1, "Only batch size 1 is supported when cu_seqlens are provided" + if len(g.shape) == 3: + return chunk_local_cumsum_scalar( + g=g, + chunk_size=chunk_size, + reverse=reverse, + scale=scale, + cu_seqlens=cu_seqlens, + head_first=head_first, + output_dtype=output_dtype, + chunk_indices=chunk_indices, + ) + elif len(g.shape) == 4: + return chunk_local_cumsum_vector( + g=g, + chunk_size=chunk_size, + reverse=reverse, + scale=scale, + cu_seqlens=cu_seqlens, + head_first=head_first, + output_dtype=output_dtype, + chunk_indices=chunk_indices, + ) + else: + raise 
ValueError( + f"Unsupported input shape {g.shape}, which should be (B, T, H, D) if `head_first=False` or (B, H, T, D) otherwise", + ) diff --git a/examples/kda/FLA_KDA/fla_chunk_delta.py b/examples/kda/FLA_KDA/fla_chunk_delta.py new file mode 100644 index 000000000..3b0fc908d --- /dev/null +++ b/examples/kda/FLA_KDA/fla_chunk_delta.py @@ -0,0 +1,579 @@ +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +import torch +import triton +import triton.language as tl +from .fla_utils import prepare_chunk_indices, exp, exp2, USE_CUDA_GRAPH, autotune_cache_kwargs + +NUM_WARPS = [2, 4] + + +@triton.heuristics( + { + "USE_G": lambda args: args["g"] is not None, + "USE_GK": lambda args: args["gk"] is not None, + "USE_INITIAL_STATE": lambda args: args["h0"] is not None, + "STORE_FINAL_STATE": lambda args: args["ht"] is not None, + "SAVE_NEW_VALUE": lambda args: args["v_new"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BV": BV}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [2, 4] + for num_stages in [2, 3, 4] + for BV in [32, 64] + ], + key=["H", "K", "V", "BT", "USE_EXP2"], + use_cuda_graph=USE_CUDA_GRAPH, + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_gated_delta_rule_fwd_kernel_h_blockdim64( + k, + v, + w, + v_new, + g, + gk, + h, + h0, + ht, + cu_seqlens, + chunk_offsets, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + USE_GK: tl.constexpr, + USE_INITIAL_STATE: tl.constexpr, + STORE_FINAL_STATE: tl.constexpr, + SAVE_NEW_VALUE: tl.constexpr, + USE_EXP2: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_v, i_nh = tl.program_id(0), tl.program_id(1) + i_n, i_h = i_nh // H, i_nh % H + if IS_VARLEN: + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + boh = tl.load(chunk_offsets + i_n).to(tl.int32) + else: + bos, eos = i_n * T, i_n * T + T + NT = tl.cdiv(T, BT) + boh = i_n * NT + + # [BK, BV] + b_h1 = tl.zeros([64, BV], dtype=tl.float32) + if K > 64: + b_h2 = tl.zeros([64, BV], dtype=tl.float32) + if K > 128: + b_h3 = tl.zeros([64, BV], dtype=tl.float32) + if K > 192: + b_h4 = tl.zeros([64, BV], dtype=tl.float32) + + # calculate offset + h += ((boh * H + i_h) * K * V).to(tl.int64) + v += ((bos * H + i_h) * V).to(tl.int64) + k += ((bos * H + i_h) * K).to(tl.int64) + w += ((bos * H + i_h) * K).to(tl.int64) + if SAVE_NEW_VALUE: + v_new += ((bos * H + i_h) * V).to(tl.int64) + stride_v = H * V + stride_h = H * K * V + stride_k = H * K + if USE_INITIAL_STATE: + h0 = h0 + i_nh * K * V + if STORE_FINAL_STATE: + ht = ht + i_nh * K * V + + # load initial state + if USE_INITIAL_STATE: + p_h0_1 = tl.make_block_ptr(h0, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) + b_h1 += tl.load(p_h0_1, boundary_check=(0, 1)).to(tl.float32) + if K > 64: + p_h0_2 = tl.make_block_ptr(h0, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)) + b_h2 += tl.load(p_h0_2, boundary_check=(0, 1)).to(tl.float32) + if K > 128: + p_h0_3 = tl.make_block_ptr(h0, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)) + b_h3 += tl.load(p_h0_3, boundary_check=(0, 1)).to(tl.float32) + if K > 192: + p_h0_4 = tl.make_block_ptr(h0, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)) + b_h4 += tl.load(p_h0_4, boundary_check=(0, 1)).to(tl.float32) + + # main recurrence + for i_t in range(NT): + p_h1 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (0, i_v * BV), 
(64, BV), (1, 0)) + tl.store(p_h1, b_h1.to(p_h1.dtype.element_ty), boundary_check=(0, 1)) + if K > 64: + p_h2 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)) + tl.store(p_h2, b_h2.to(p_h2.dtype.element_ty), boundary_check=(0, 1)) + if K > 128: + p_h3 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)) + tl.store(p_h3, b_h3.to(p_h3.dtype.element_ty), boundary_check=(0, 1)) + if K > 192: + p_h4 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)) + tl.store(p_h4, b_h4.to(p_h4.dtype.element_ty), boundary_check=(0, 1)) + + p_w = tl.make_block_ptr(w, (T, K), (stride_k, 1), (i_t * BT, 0), (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v = tl.dot(b_w, b_h1.to(b_w.dtype)) + if K > 64: + p_w = tl.make_block_ptr(w, (T, K), (stride_k, 1), (i_t * BT, 64), (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v += tl.dot(b_w, b_h2.to(b_w.dtype)) + if K > 128: + p_w = tl.make_block_ptr(w, (T, K), (stride_k, 1), (i_t * BT, 128), (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v += tl.dot(b_w, b_h3.to(b_w.dtype)) + if K > 192: + p_w = tl.make_block_ptr(w, (T, K), (stride_k, 1), (i_t * BT, 192), (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v += tl.dot(b_w, b_h4.to(b_w.dtype)) + p_v = tl.make_block_ptr(v, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_v = tl.load(p_v, boundary_check=(0, 1)) - b_v + + if SAVE_NEW_VALUE: + p_v = tl.make_block_ptr(v_new, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + tl.store(p_v, b_v.to(p_v.dtype.element_ty), boundary_check=(0, 1)) + + last_idx = min((i_t + 1) * BT, T) - 1 + if USE_G: + m_t = (i_t * BT + tl.arange(0, BT)) < T + b_g_last = tl.load(g + bos * H + last_idx * H + i_h) + p_g = tl.make_block_ptr(g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + b_g = tl.load(p_g, boundary_check=(0,)) + if USE_EXP2: + b_v = b_v * tl.where(m_t, exp2(b_g_last - b_g), 0)[:, None] + b_g_last = exp2(b_g_last) + else: + b_v = b_v * tl.where(m_t, exp(b_g_last - b_g), 0)[:, None] + b_g_last = exp(b_g_last) + b_h1 *= b_g_last + if K > 64: + b_h2 *= b_g_last + if K > 128: + b_h3 *= b_g_last + if K > 192: + b_h4 *= b_g_last + + if USE_GK: + o_k1 = tl.arange(0, 64) + b_gk_last1 = tl.load(gk + (bos + last_idx) * H * K + i_h * K + o_k1, mask=(o_k1 < K), other=0.0) + if USE_EXP2: + b_h1 *= exp2(b_gk_last1)[:, None] + else: + b_h1 *= exp(b_gk_last1)[:, None] + if K > 64: + o_k2 = 64 + o_k1 + b_gk_last2 = tl.load(gk + (bos + last_idx) * H * K + i_h * K + o_k2, mask=(o_k2 < K), other=0.0) + if USE_EXP2: + b_h2 *= exp2(b_gk_last2)[:, None] + else: + b_h2 *= exp(b_gk_last2)[:, None] + if K > 128: + o_k3 = 128 + o_k1 + b_gk_last3 = tl.load(gk + (bos + last_idx) * H * K + i_h * K + o_k3, mask=(o_k3 < K), other=0.0) + if USE_EXP2: + b_h3 *= exp2(b_gk_last3)[:, None] + else: + b_h3 *= exp(b_gk_last3)[:, None] + if K > 192: + o_k4 = 192 + o_k1 + b_gk_last4 = tl.load(gk + (bos + last_idx) * H * K + i_h * K + o_k4, mask=(o_k4 < K), other=0.0) + if USE_EXP2: + b_h4 *= exp2(b_gk_last4)[:, None] + else: + b_h4 *= exp(b_gk_last4)[:, None] + b_v = b_v.to(k.dtype.element_ty) + + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (0, i_t * BT), (64, BT), (0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h1 += tl.dot(b_k, b_v) + if K > 64: + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (64, i_t * BT), (64, BT), (0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h2 += 
tl.dot(b_k, b_v) + if K > 128: + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (128, i_t * BT), (64, BT), (0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h3 += tl.dot(b_k, b_v) + if K > 192: + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (192, i_t * BT), (64, BT), (0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h4 += tl.dot(b_k, b_v) + # epilogue + if STORE_FINAL_STATE: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) + tl.store(p_ht, b_h1.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + if K > 64: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)) + tl.store(p_ht, b_h2.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + if K > 128: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)) + tl.store(p_ht, b_h3.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + if K > 192: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)) + tl.store(p_ht, b_h4.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics( + { + "USE_G": lambda args: args["g"] is not None, + "USE_GK": lambda args: args["gk"] is not None, + "USE_INITIAL_STATE": lambda args: args["dh0"] is not None, + "USE_FINAL_STATE_GRADIENT": lambda args: args["dht"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BV": BV}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [2, 4] + for num_stages in ([4, 3, 2]) + for BV in [64, 32] + ], + key=["H", "K", "V", "BT", "BV", "USE_G", "USE_EXP2"], + use_cuda_graph=USE_CUDA_GRAPH, + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64( + q, + k, + w, + g, + gk, + dht, + dh0, + do, + dh, + dv, + dv2, + cu_seqlens, + chunk_offsets, + scale, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + USE_GK: tl.constexpr, + USE_INITIAL_STATE: tl.constexpr, + USE_FINAL_STATE_GRADIENT: tl.constexpr, + USE_EXP2: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_v, i_nh = tl.program_id(0), tl.program_id(1) + i_n, i_h = i_nh // H, i_nh % H + if IS_VARLEN: + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + boh = tl.load(chunk_offsets + i_n).to(tl.int32) + else: + bos, eos = i_n * T, i_n * T + T + NT = tl.cdiv(T, BT) + boh = i_n * NT + + # [BK, BV] + b_dh1 = tl.zeros([64, BV], dtype=tl.float32) + if K > 64: + b_dh2 = tl.zeros([64, BV], dtype=tl.float32) + if K > 128: + b_dh3 = tl.zeros([64, BV], dtype=tl.float32) + if K > 192: + b_dh4 = tl.zeros([64, BV], dtype=tl.float32) + + # calculate offset + q += ((bos * H + i_h) * K).to(tl.int64) + k += ((bos * H + i_h) * K).to(tl.int64) + w += ((bos * H + i_h) * K).to(tl.int64) + do += ((bos * H + i_h) * V).to(tl.int64) + dv += ((bos * H + i_h) * V).to(tl.int64) + dv2 += ((bos * H + i_h) * V).to(tl.int64) + dh += ((boh * H + i_h) * K * V).to(tl.int64) + if USE_GK: + gk += ((bos * H + i_h) * K).to(tl.int64) + + stride_v = H * V + stride_h = H * K * V + stride_k = H * K + if USE_INITIAL_STATE: + dh0 += i_nh * K * V + if USE_FINAL_STATE_GRADIENT: + dht += i_nh * K * V + + if USE_FINAL_STATE_GRADIENT: + p_dht1 = tl.make_block_ptr(dht, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) + b_dh1 += tl.load(p_dht1, boundary_check=(0, 1)) + if K > 64: + p_dht2 = tl.make_block_ptr(dht, (K, V), (V, 1), (64, i_v * 
BV), (64, BV), (1, 0)) + b_dh2 += tl.load(p_dht2, boundary_check=(0, 1)) + if K > 128: + p_dht3 = tl.make_block_ptr(dht, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)) + b_dh3 += tl.load(p_dht3, boundary_check=(0, 1)) + if K > 192: + p_dht4 = tl.make_block_ptr(dht, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)) + b_dh4 += tl.load(p_dht4, boundary_check=(0, 1)) + + for i_t in range(NT - 1, -1, -1): + p_dh1 = tl.make_block_ptr(dh + i_t * stride_h, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh1, b_dh1.to(p_dh1.dtype.element_ty), boundary_check=(0, 1)) + if K > 64: + p_dh2 = tl.make_block_ptr(dh + i_t * stride_h, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh2, b_dh2.to(p_dh2.dtype.element_ty), boundary_check=(0, 1)) + if K > 128: + p_dh3 = tl.make_block_ptr(dh + i_t * stride_h, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh3, b_dh3.to(p_dh3.dtype.element_ty), boundary_check=(0, 1)) + if K > 192: + p_dh4 = tl.make_block_ptr(dh + i_t * stride_h, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh4, b_dh4.to(p_dh4.dtype.element_ty), boundary_check=(0, 1)) + + last_idx = min((i_t + 1) * BT, T) - 1 + if USE_G: + bg_last = tl.load(g + (bos + last_idx) * H + i_h) + p_g = tl.make_block_ptr(g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + b_g = tl.load(p_g, boundary_check=(0,)) + if USE_EXP2: + bg_last_exp = exp2(bg_last) + b_g_exp = exp2(b_g) + else: + bg_last_exp = exp(bg_last) + b_g_exp = exp(b_g) + + p_dv = tl.make_block_ptr(dv, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_dv2 = tl.make_block_ptr(dv2, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_do = tl.make_block_ptr(do, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + + b_do = tl.load(p_do, boundary_check=(0, 1)) + + # Update dv + p_k = tl.make_block_ptr(k, (T, K), (stride_k, 1), (i_t * BT, 0), (BT, 64), (1, 0)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + if USE_GK: + o_k1 = tl.arange(0, 64) + b_gk_last1 = tl.load(gk + last_idx * H * K + o_k1, mask=(o_k1 < K), other=0.0) + b_dv = tl.dot(b_k, b_dh1.to(b_k.dtype)) + + if K > 64: + p_k = tl.make_block_ptr(k, (T, K), (stride_k, 1), (i_t * BT, 64), (BT, 64), (1, 0)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + if USE_GK: + o_k2 = 64 + o_k1 + b_gk_last2 = tl.load(gk + last_idx * H * K + o_k2, mask=(o_k2 < K), other=0.0) + b_dv += tl.dot(b_k, b_dh2.to(b_k.dtype)) + + if K > 128: + p_k = tl.make_block_ptr(k, (T, K), (stride_k, 1), (i_t * BT, 128), (BT, 64), (1, 0)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + if USE_GK: + o_k3 = 128 + o_k1 + b_gk_last3 = tl.load(gk + last_idx * H * K + o_k3, mask=(o_k3 < K), other=0.0) + b_dv += tl.dot(b_k, b_dh3.to(b_k.dtype)) + + if K > 192: + p_k = tl.make_block_ptr(k, (T, K), (stride_k, 1), (i_t * BT, 192), (BT, 64), (1, 0)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + if USE_GK: + o_k4 = 192 + o_k1 + b_gk_last4 = tl.load(gk + last_idx * H * K + o_k4, mask=(o_k4 < K), other=0.0) + b_dv += tl.dot(b_k, b_dh4.to(b_k.dtype)) + + if USE_G: + m_t = (i_t * BT + tl.arange(0, BT)) < T + if USE_EXP2: + b_dv *= tl.where(m_t, exp2(bg_last - b_g), 0)[:, None] + else: + b_dv *= tl.where(m_t, exp(bg_last - b_g), 0)[:, None] + b_dv += tl.load(p_dv, boundary_check=(0, 1)) + + tl.store(p_dv2, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + # Update dh + p_w = tl.make_block_ptr(w, (K, T), (1, stride_k), (0, i_t * BT), (64, BT), (0, 1)) + p_q = tl.make_block_ptr(q, (K, T), (1, stride_k), (0, i_t * BT), (64, BT), (0, 1)) + 
b_w = tl.load(p_w, boundary_check=(0, 1)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + if USE_G: + b_dh1 *= bg_last_exp + b_q = b_q * b_g_exp[None, :] + if USE_GK: + if USE_EXP2: + b_dh1 *= exp2(b_gk_last1[:, None]) + else: + b_dh1 *= exp(b_gk_last1[:, None]) + b_dh1 += tl.dot(b_q.to(b_q.dtype), b_do.to(b_q.dtype)) * scale - tl.dot(b_w, b_dv.to(b_w.dtype)) + if K > 64: + p_q = tl.make_block_ptr(q, (K, T), (1, stride_k), (64, i_t * BT), (64, BT), (0, 1)) + p_w = tl.make_block_ptr(w, (K, T), (1, stride_k), (64, i_t * BT), (64, BT), (0, 1)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + if USE_G: + b_dh2 *= bg_last_exp + b_q = b_q * b_g_exp[None, :] + if USE_GK: + if USE_EXP2: + b_dh2 *= exp2(b_gk_last2[:, None]) + else: + b_dh2 *= exp(b_gk_last2[:, None]) + b_dh2 += tl.dot(b_q.to(b_q.dtype), b_do.to(b_q.dtype)) * scale - tl.dot(b_w, b_dv.to(b_w.dtype)) + if K > 128: + p_q = tl.make_block_ptr(q, (K, T), (1, stride_k), (128, i_t * BT), (64, BT), (0, 1)) + p_w = tl.make_block_ptr(w, (K, T), (1, stride_k), (128, i_t * BT), (64, BT), (0, 1)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + if USE_G: + b_dh3 *= bg_last_exp + b_q = b_q * b_g_exp[None, :] + if USE_GK: + if USE_EXP2: + b_dh3 *= exp2(b_gk_last3[:, None]) + else: + b_dh3 *= exp(b_gk_last3[:, None]) + b_dh3 += tl.dot(b_q.to(b_q.dtype), b_do.to(b_q.dtype)) * scale - tl.dot(b_w, b_dv.to(b_w.dtype)) + if K > 192: + p_q = tl.make_block_ptr(q, (K, T), (1, stride_k), (192, i_t * BT), (64, BT), (0, 1)) + p_w = tl.make_block_ptr(w, (K, T), (1, stride_k), (192, i_t * BT), (64, BT), (0, 1)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + if USE_G: + b_dh4 *= bg_last_exp + b_q = b_q * b_g_exp[None, :] + if USE_GK: + if USE_EXP2: + b_dh4 *= exp2(b_gk_last4[:, None]) + else: + b_dh4 *= exp(b_gk_last4[:, None]) + b_dh4 += tl.dot(b_q.to(b_q.dtype), b_do.to(b_q.dtype)) * scale - tl.dot(b_w, b_dv.to(b_w.dtype)) + + if USE_INITIAL_STATE: + p_dh0 = tl.make_block_ptr(dh0, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh0, b_dh1.to(p_dh0.dtype.element_ty), boundary_check=(0, 1)) + if K > 64: + p_dh1 = tl.make_block_ptr(dh0, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh1, b_dh2.to(p_dh1.dtype.element_ty), boundary_check=(0, 1)) + if K > 128: + p_dh2 = tl.make_block_ptr(dh0, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh2, b_dh3.to(p_dh2.dtype.element_ty), boundary_check=(0, 1)) + if K > 192: + p_dh3 = tl.make_block_ptr(dh0, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh3, b_dh4.to(p_dh3.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_gated_delta_rule_fwd_h( + k: torch.Tensor, + w: torch.Tensor, + u: torch.Tensor, + g: torch.Tensor = None, + gk: torch.Tensor = None, + initial_state: torch.Tensor = None, + output_final_state: bool = False, + chunk_size: int = 64, # SY: remove this argument and force chunk size 64? 
+ save_new_value: bool = True, + cu_seqlens: torch.LongTensor = None, + chunk_indices: torch.LongTensor = None, + use_exp2: bool = False, +) -> tuple[torch.Tensor, torch.Tensor]: + B, T, H, K, V = *k.shape, u.shape[-1] + BT = chunk_size + + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, chunk_size) + # N: the actual number of sequences in the batch with either equal or variable lengths + if cu_seqlens is None: + N, NT, chunk_offsets = B, triton.cdiv(T, BT), None + assert K <= 256, "current kernel does not support head dimension larger than 256." + + h = k.new_empty(B, NT, H, K, V) + final_state = k.new_empty(N, H, K, V, dtype=torch.float32) if output_final_state else None + + v_new = torch.empty_like(u) if save_new_value else None + + def grid(meta): + return (triton.cdiv(V, meta["BV"]), N * H) + + chunk_gated_delta_rule_fwd_kernel_h_blockdim64[grid]( + k=k, + v=u, + w=w, + v_new=v_new, + g=g, + gk=gk, + h=h, + h0=initial_state, + ht=final_state, + cu_seqlens=cu_seqlens, + chunk_offsets=chunk_offsets, + T=T, + H=H, + K=K, + V=V, + BT=BT, + USE_EXP2=use_exp2, + ) + return h, v_new, final_state + + +def chunk_gated_delta_rule_bwd_dhu( + q: torch.Tensor, + k: torch.Tensor, + w: torch.Tensor, + do: torch.Tensor, + dv: torch.Tensor, + g: torch.Tensor = None, + gk: torch.Tensor = None, + h0: torch.Tensor = None, + dht: torch.Tensor = None, + scale: float = None, + cu_seqlens: torch.LongTensor = None, + chunk_size: int = 64, # SY: remove this argument and force chunk size 64? + chunk_indices: torch.LongTensor = None, + use_exp2: bool = False, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + B, T, H, K, V = *q.shape, do.shape[-1] + # N: the actual number of sequences in the batch with either equal or variable lengths + BT = 64 + assert K <= 256, "current kernel does not support head dimension being larger than 256." 
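+    # the kernel splits K into up to four 64-wide register tiles (b_dh1..b_dh4), hence the 256 cap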
+ + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, chunk_size) + if cu_seqlens is None: + N, NT, chunk_offsets = B, triton.cdiv(T, BT), None + + dh = q.new_empty(B, NT, H, K, V) + dh0 = torch.empty_like(h0, dtype=torch.float32) if h0 is not None else None + dv2 = torch.empty_like(dv) + + def grid(meta): + return (triton.cdiv(V, meta["BV"]), N * H) + + chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64[grid]( + q=q, + k=k, + w=w, + g=g, + gk=gk, + dht=dht, + dh0=dh0, + do=do, + dh=dh, + dv=dv, + dv2=dv2, + cu_seqlens=cu_seqlens, + chunk_offsets=chunk_offsets, + scale=scale, + T=T, + H=H, + K=K, + V=V, + BT=BT, + USE_EXP2=use_exp2, + ) + return dh, dh0, dv2 diff --git a/examples/kda/FLA_KDA/fla_chunk_inter.py b/examples/kda/FLA_KDA/fla_chunk_inter.py new file mode 100644 index 000000000..e6de9bb28 --- /dev/null +++ b/examples/kda/FLA_KDA/fla_chunk_inter.py @@ -0,0 +1,193 @@ +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + + +import torch +import triton +import triton.language as tl + +from .fla_utils import prepare_chunk_indices, exp2, autotune_cache_kwargs, check_shared_mem + +BK_LIST = [32, 64] if check_shared_mem() else [16, 32] +BV_LIST = [64, 128] if check_shared_mem("ampere") else [16, 32] + + +@triton.heuristics( + { + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BK": BK, "BV": BV}, num_warps=num_warps, num_stages=num_stages) + for BK in BK_LIST + for BV in BV_LIST + for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=["BT"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_kda_bwd_kernel_inter( + q, + k, + v, + g, + h, + do, + dh, + dq, + dk, + dv, + dw, + dg, + cu_seqlens, + chunk_indices, + scale, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_k, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_tg = i_t + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + else: + NT = tl.cdiv(T, BT) + i_tg = i_b * NT + i_t + bos, eos = i_b * T, i_b * T + T + o_k = i_k * BK + tl.arange(0, BK) + o_t = i_t * BT + tl.arange(0, BT) + m_k = o_k < K + m_t = o_t < T + m_last = o_t == min(T, i_t * BT + BT) - 1 + + q += (bos * H + i_h) * K + k += (bos * H + i_h) * K + v += (bos * H + i_h) * V + g += (bos * H + i_h) * K + h += (i_tg * H + i_h) * K * V + do += (bos * H + i_h) * V + dh += (i_tg * H + i_h) * K * V + dq += (bos * H + i_h) * K + dk += (bos * H + i_h) * K + dw += (bos * H + i_h) * K + dv += (bos * H + i_h) * V + dg += (bos * H + i_h) * K + + p_g = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_g = tl.load(p_g, boundary_check=(0, 1)) + p_gn = g + (min(T, i_t * BT + BT) - 1) * H * K + o_k + b_gn = tl.load(p_gn, mask=m_k, other=0) + b_dq = tl.zeros([BT, BK], dtype=tl.float32) + b_dk = tl.zeros([BT, BK], dtype=tl.float32) + b_dw = tl.zeros([BT, BK], dtype=tl.float32) + b_dgk = tl.zeros([BK], dtype=tl.float32) + + for i_v in range(tl.cdiv(V, BV)): + p_v = tl.make_block_ptr(v, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_do = tl.make_block_ptr(do, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_h 
= tl.make_block_ptr(h, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1)) + p_dh = tl.make_block_ptr(dh, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1)) + # [BT, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_do = tl.load(p_do, boundary_check=(0, 1)) + # [BV, BK] + b_h = tl.load(p_h, boundary_check=(0, 1)) + b_dh = tl.load(p_dh, boundary_check=(0, 1)) + + # [BK] + b_dgk += tl.sum(b_h * b_dh, axis=0) + # [BT, BK] + b_dq += tl.dot(b_do, b_h.to(b_do.dtype)) + b_dk += tl.dot(b_v, b_dh.to(b_v.dtype)) + + p_dv = tl.make_block_ptr(dv, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_dv = tl.load(p_dv, boundary_check=(0, 1)) + b_dw += tl.dot(b_dv.to(b_v.dtype), b_h.to(b_v.dtype)) + + p_dw = tl.make_block_ptr(dw, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + tl.store(p_dw, -b_dw.to(p_dw.dtype.element_ty), boundary_check=(0, 1)) + + b_dgk *= exp2(b_gn) + b_dq *= scale + b_dq = b_dq * exp2(b_g) + b_dk = b_dk * tl.where(m_t[:, None], exp2(b_gn[None, :] - b_g), 0) + + p_q = tl.make_block_ptr(q, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_k = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dq = tl.make_block_ptr(dq, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dk = tl.make_block_ptr(dk, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dg = tl.make_block_ptr(dg, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_dgk += tl.sum(b_dk * b_k, axis=0) + b_dg = b_q * b_dq - b_k * b_dk + m_last[:, None] * b_dgk + + tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dg, b_dg.to(p_dg.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_kda_bwd_dqkwg( + q: torch.Tensor, + k: torch.Tensor, + w: torch.Tensor, + v: torch.Tensor, + h: torch.Tensor, + g: torch.Tensor, + do: torch.Tensor, + dh: torch.Tensor, + dv: torch.Tensor, + scale: float = None, + cu_seqlens: torch.LongTensor = None, + chunk_size: int = 64, + chunk_indices: torch.LongTensor = None, +): + B, T, H, K, V = *k.shape, v.shape[-1] + BT = chunk_size + + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, BT) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + + dq = torch.empty_like(q, dtype=torch.float) + dk = torch.empty_like(k, dtype=torch.float) + dw = torch.empty_like(w) + dg = torch.empty_like(g) + + def grid(meta): + return (triton.cdiv(K, meta["BK"]), NT, B * H) + + chunk_kda_bwd_kernel_inter[grid]( + q=q, + k=k, + v=v, + g=g, + h=h, + do=do, + dh=dh, + dq=dq, + dk=dk, + dv=dv, + dw=dw, + dg=dg, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + scale=scale, + T=T, + H=H, + K=K, + V=V, + BT=BT, + ) + return dq, dk, dw, dg diff --git a/examples/kda/FLA_KDA/fla_chunk_intra.py b/examples/kda/FLA_KDA/fla_chunk_intra.py new file mode 100644 index 000000000..244f05f1c --- /dev/null +++ b/examples/kda/FLA_KDA/fla_chunk_intra.py @@ -0,0 +1,650 @@ +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +import torch +import triton +import triton.language as tl + +from .fla_utils import autotune_cache_kwargs, exp2, prepare_chunk_indices +from .cumsum import chunk_local_cumsum + +IS_TF32_SUPPORTED = False +if IS_TF32_SUPPORTED: + SOLVE_TRIL_DOT_PRECISION = tl.constexpr("tf32x3") +else: + SOLVE_TRIL_DOT_PRECISION = tl.constexpr("ieee") 
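+# `input_precision` controls how `tl.dot` handles fp32 inputs: "ieee" keeps full fp32
+# accuracy, "tf32" rounds them to TensorFloat-32, and "tf32x3" approximates fp32 accuracy
+# with three tf32 products; note that the unconditional assignment below is the value the
+# triangular solve actually uses.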
+SOLVE_TRIL_DOT_PRECISION = tl.constexpr("tf32") +# ============================================================================ +# Fused inter + solve_tril kernel: compute off-diagonal Akk and solve in one pass +# ============================================================================ + + +@triton.heuristics( + { + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[triton.Config({"BK": BK}, num_warps=num_warps) for BK in [32, 64] for num_warps in [1, 2, 4]], + key=["H", "K", "BC"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_kda_fwd_kernel_inter_solve_fused( + q, + k, + g, + beta, + Aqk, + Akk_diag, + Akk, + scale, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + K: tl.constexpr, + BT: tl.constexpr, + BC: tl.constexpr, + BK: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + """ + Fused kernel: compute inter-subchunk Akk + solve_tril in one pass. + Prerequisite: token_parallel has already computed diagonal Akk blocks in Akk_diag. + + This kernel: + 1. Computes off-diagonal Aqk blocks -> writes to global + 2. Computes off-diagonal Akk blocks -> keeps in registers + 3. Loads diagonal Akk blocks from Akk_diag (fp32) + 4. Does forward substitution on diagonals + 5. Computes merged Akk_inv + 6. Writes Akk_inv to Akk + """ + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + if i_t * BT >= T: + return + + i_tc0 = i_t * BT + i_tc1 = i_t * BT + BC + i_tc2 = i_t * BT + 2 * BC + i_tc3 = i_t * BT + 3 * BC + + q += (bos * H + i_h) * K + k += (bos * H + i_h) * K + g += (bos * H + i_h) * K + Aqk += (bos * H + i_h) * BT + Akk += (bos * H + i_h) * BT + Akk_diag += (bos * H + i_h) * BC + + m_tc1 = (i_tc1 + tl.arange(0, BC)) < T + m_tc2 = (i_tc2 + tl.arange(0, BC)) < T + m_tc3 = (i_tc3 + tl.arange(0, BC)) < T + + b_Aqk10 = tl.zeros([BC, BC], dtype=tl.float32) + b_Akk10 = tl.zeros([BC, BC], dtype=tl.float32) + + b_Aqk20 = tl.zeros([BC, BC], dtype=tl.float32) + b_Akk20 = tl.zeros([BC, BC], dtype=tl.float32) + b_Aqk21 = tl.zeros([BC, BC], dtype=tl.float32) + b_Akk21 = tl.zeros([BC, BC], dtype=tl.float32) + + b_Aqk30 = tl.zeros([BC, BC], dtype=tl.float32) + b_Akk30 = tl.zeros([BC, BC], dtype=tl.float32) + b_Aqk31 = tl.zeros([BC, BC], dtype=tl.float32) + b_Akk31 = tl.zeros([BC, BC], dtype=tl.float32) + b_Aqk32 = tl.zeros([BC, BC], dtype=tl.float32) + b_Akk32 = tl.zeros([BC, BC], dtype=tl.float32) + + ################################################################################ + # 1. 
off-diagonal blocks + ################################################################################ + for i_k in range(tl.cdiv(K, BK)): + o_k = i_k * BK + tl.arange(0, BK) + m_k = o_k < K + + p_k0 = tl.make_block_ptr(k, (K, T), (1, H * K), (i_k * BK, i_tc0), (BK, BC), (0, 1)) + p_g0 = tl.make_block_ptr(g, (K, T), (1, H * K), (i_k * BK, i_tc0), (BK, BC), (0, 1)) + b_kt0 = tl.load(p_k0, boundary_check=(0, 1)).to(tl.float32) + b_gt0 = tl.load(p_g0, boundary_check=(0, 1)).to(tl.float32) + + b_kt1, b_gt1 = b_kt0, b_gt0 + b_kt2, b_gt2 = b_kt0, b_gt0 + if i_tc1 < T: + p_q1 = tl.make_block_ptr(q, (T, K), (H * K, 1), (i_tc1, i_k * BK), (BC, BK), (1, 0)) + p_k1 = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_tc1, i_k * BK), (BC, BK), (1, 0)) + p_g1 = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_tc1, i_k * BK), (BC, BK), (1, 0)) + + b_q1 = tl.load(p_q1, boundary_check=(0, 1)).to(tl.float32) + b_k1 = tl.load(p_k1, boundary_check=(0, 1)).to(tl.float32) + b_g1 = tl.load(p_g1, boundary_check=(0, 1)).to(tl.float32) + b_kt1 = tl.trans(b_k1) + b_gt1 = tl.trans(b_g1) + + b_gn1 = tl.load(g + i_tc1 * H * K + o_k, mask=m_k, other=0).to(tl.float32) + b_gqn1 = tl.where(m_tc1[:, None], exp2(b_g1 - b_gn1[None, :]), 0) + b_qg1 = b_q1 * b_gqn1 + b_kg1 = b_k1 * b_gqn1 + b_kgt = b_kt0 * exp2(b_gn1[:, None] - b_gt0) + b_Aqk10 += tl.dot(b_qg1, b_kgt) + b_Akk10 += tl.dot(b_kg1, b_kgt) + + if i_tc2 < T: + p_q2 = tl.make_block_ptr(q, (T, K), (H * K, 1), (i_tc2, i_k * BK), (BC, BK), (1, 0)) + p_k2 = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_tc2, i_k * BK), (BC, BK), (1, 0)) + p_g2 = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_tc2, i_k * BK), (BC, BK), (1, 0)) + + b_q2 = tl.load(p_q2, boundary_check=(0, 1)).to(tl.float32) + b_k2 = tl.load(p_k2, boundary_check=(0, 1)).to(tl.float32) + b_g2 = tl.load(p_g2, boundary_check=(0, 1)).to(tl.float32) + b_kt2 = tl.trans(b_k2) + b_gt2 = tl.trans(b_g2) + + b_gn2 = tl.load(g + i_tc2 * H * K + o_k, mask=m_k, other=0).to(tl.float32) + b_gqn2 = tl.where(m_tc2[:, None], exp2(b_g2 - b_gn2[None, :]), 0) + b_qg2 = b_q2 * b_gqn2 + b_kg2 = b_k2 * b_gqn2 + b_kgt = b_kt0 * exp2(b_gn2[:, None] - b_gt0) + b_Aqk20 += tl.dot(b_qg2, b_kgt) + b_Akk20 += tl.dot(b_kg2, b_kgt) + + b_kgt = b_kt1 * exp2(b_gn2[:, None] - b_gt1) + b_Aqk21 += tl.dot(b_qg2, b_kgt) + b_Akk21 += tl.dot(b_kg2, b_kgt) + + if i_tc3 < T: + p_q3 = tl.make_block_ptr(q, (T, K), (H * K, 1), (i_tc3, i_k * BK), (BC, BK), (1, 0)) + p_k3 = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_tc3, i_k * BK), (BC, BK), (1, 0)) + p_g3 = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_tc3, i_k * BK), (BC, BK), (1, 0)) + b_q3 = tl.load(p_q3, boundary_check=(0, 1)).to(tl.float32) + b_k3 = tl.load(p_k3, boundary_check=(0, 1)).to(tl.float32) + b_g3 = tl.load(p_g3, boundary_check=(0, 1)).to(tl.float32) + + b_gn3 = tl.load(g + i_tc3 * H * K + o_k, mask=m_k, other=0).to(tl.float32) + b_gqn3 = tl.where(m_tc3[:, None], exp2(b_g3 - b_gn3[None, :]), 0) + b_qg3 = b_q3 * b_gqn3 + b_kg3 = b_k3 * b_gqn3 + b_kgt = b_kt0 * exp2(b_gn3[:, None] - b_gt0) + b_Aqk30 += tl.dot(b_qg3, b_kgt) + b_Akk30 += tl.dot(b_kg3, b_kgt) + + b_kgt = b_kt1 * exp2(b_gn3[:, None] - b_gt1) + b_Aqk31 += tl.dot(b_qg3, b_kgt) + b_Akk31 += tl.dot(b_kg3, b_kgt) + + b_kgt = b_kt2 * exp2(b_gn3[:, None] - b_gt2) + b_Aqk32 += tl.dot(b_qg3, b_kgt) + b_Akk32 += tl.dot(b_kg3, b_kgt) + + ################################################################################ + # 2. 
save off-diagonal Aqk blocks and prepare Akk + ################################################################################ + if i_tc1 < T: + p_Aqk10 = tl.make_block_ptr(Aqk, (T, BT), (H * BT, 1), (i_tc1, 0), (BC, BC), (1, 0)) + tl.store(p_Aqk10, (b_Aqk10 * scale).to(Aqk.dtype.element_ty), boundary_check=(0, 1)) + + p_b1 = tl.make_block_ptr(beta + bos * H + i_h, (T,), (H,), (i_tc1,), (BC,), (0,)) + b_b1 = tl.load(p_b1, boundary_check=(0,)).to(tl.float32) + b_Akk10 = b_Akk10 * b_b1[:, None] + if i_tc2 < T: + p_Aqk20 = tl.make_block_ptr(Aqk, (T, BT), (H * BT, 1), (i_tc2, 0), (BC, BC), (1, 0)) + p_Aqk21 = tl.make_block_ptr(Aqk, (T, BT), (H * BT, 1), (i_tc2, BC), (BC, BC), (1, 0)) + tl.store(p_Aqk20, (b_Aqk20 * scale).to(Aqk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Aqk21, (b_Aqk21 * scale).to(Aqk.dtype.element_ty), boundary_check=(0, 1)) + + p_b2 = tl.make_block_ptr(beta + bos * H + i_h, (T,), (H,), (i_tc2,), (BC,), (0,)) + b_b2 = tl.load(p_b2, boundary_check=(0,)).to(tl.float32) + b_Akk20 = b_Akk20 * b_b2[:, None] + b_Akk21 = b_Akk21 * b_b2[:, None] + if i_tc3 < T: + p_Aqk30 = tl.make_block_ptr(Aqk, (T, BT), (H * BT, 1), (i_tc3, 0), (BC, BC), (1, 0)) + p_Aqk31 = tl.make_block_ptr(Aqk, (T, BT), (H * BT, 1), (i_tc3, BC), (BC, BC), (1, 0)) + p_Aqk32 = tl.make_block_ptr(Aqk, (T, BT), (H * BT, 1), (i_tc3, 2 * BC), (BC, BC), (1, 0)) + tl.store(p_Aqk30, (b_Aqk30 * scale).to(Aqk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Aqk31, (b_Aqk31 * scale).to(Aqk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Aqk32, (b_Aqk32 * scale).to(Aqk.dtype.element_ty), boundary_check=(0, 1)) + + p_b3 = tl.make_block_ptr(beta + bos * H + i_h, (T,), (H,), (i_tc3,), (BC,), (0,)) + b_b3 = tl.load(p_b3, boundary_check=(0,)).to(tl.float32) + b_Akk30 = b_Akk30 * b_b3[:, None] + b_Akk31 = b_Akk31 * b_b3[:, None] + b_Akk32 = b_Akk32 * b_b3[:, None] + + ################################################################################ + # 3. load diagonal Akk blocks + ################################################################################ + p_Akk00 = tl.make_block_ptr(Akk_diag, (T, BC), (H * BC, 1), (i_tc0, 0), (BC, BC), (1, 0)) + p_Akk11 = tl.make_block_ptr(Akk_diag, (T, BC), (H * BC, 1), (i_tc1, 0), (BC, BC), (1, 0)) + p_Akk22 = tl.make_block_ptr(Akk_diag, (T, BC), (H * BC, 1), (i_tc2, 0), (BC, BC), (1, 0)) + p_Akk33 = tl.make_block_ptr(Akk_diag, (T, BC), (H * BC, 1), (i_tc3, 0), (BC, BC), (1, 0)) + # each diagonal block is stored contiguously: row i of block s is at Akk_diag[t=i_t*BT+s*BC+i, :BC] + b_Ai00 = tl.load(p_Akk00, boundary_check=(0, 1)).to(tl.float32) + b_Ai11 = tl.load(p_Akk11, boundary_check=(0, 1)).to(tl.float32) + b_Ai22 = tl.load(p_Akk22, boundary_check=(0, 1)).to(tl.float32) + b_Ai33 = tl.load(p_Akk33, boundary_check=(0, 1)).to(tl.float32) + + ################################################################################ + # 4. 
forward substitution on diagonals + ################################################################################ + o_i = tl.arange(0, BC) + m_A = o_i[:, None] > o_i[None, :] + m_I = o_i[:, None] == o_i[None, :] + + b_Ai00 = -tl.where(m_A, b_Ai00, 0) + b_Ai11 = -tl.where(m_A, b_Ai11, 0) + b_Ai22 = -tl.where(m_A, b_Ai22, 0) + b_Ai33 = -tl.where(m_A, b_Ai33, 0) + + # Forward substitution: load from Akk_diag (stride H*BC, columns 0:BC) + for i in range(2, min(BC, T - i_tc0)): + b_a00 = -tl.load(Akk_diag + (i_tc0 + i) * H * BC + o_i) + b_a00 = tl.where(o_i < i, b_a00, 0.0) + b_a00 += tl.sum(b_a00[:, None] * b_Ai00, 0) + b_Ai00 = tl.where((o_i == i)[:, None], b_a00, b_Ai00) + for i in range(BC + 2, min(2 * BC, T - i_tc0)): + b_a11 = -tl.load(Akk_diag + (i_tc0 + i) * H * BC + o_i) + b_a11 = tl.where(o_i < i - BC, b_a11, 0.0) + b_a11 += tl.sum(b_a11[:, None] * b_Ai11, 0) + b_Ai11 = tl.where((o_i == i - BC)[:, None], b_a11, b_Ai11) + for i in range(2 * BC + 2, min(3 * BC, T - i_tc0)): + b_a22 = -tl.load(Akk_diag + (i_tc0 + i) * H * BC + o_i) + b_a22 = tl.where(o_i < i - 2 * BC, b_a22, 0.0) + b_a22 += tl.sum(b_a22[:, None] * b_Ai22, 0) + b_Ai22 = tl.where((o_i == i - 2 * BC)[:, None], b_a22, b_Ai22) + for i in range(3 * BC + 2, min(4 * BC, T - i_tc0)): + b_a33 = -tl.load(Akk_diag + (i_tc0 + i) * H * BC + o_i) + b_a33 = tl.where(o_i < i - 3 * BC, b_a33, 0.0) + b_a33 += tl.sum(b_a33[:, None] * b_Ai33, 0) + b_Ai33 = tl.where((o_i == i - 3 * BC)[:, None], b_a33, b_Ai33) + + b_Ai00 += m_I + b_Ai11 += m_I + b_Ai22 += m_I + b_Ai33 += m_I + + # ################################################################################ + # # 5. compute merged inverse using off-diagonals + # ################################################################################ + + # we used tf32x3 to maintain matrix inverse's precision whenever possible. + b_Ai10 = -tl.dot(tl.dot(b_Ai11, b_Akk10, input_precision=SOLVE_TRIL_DOT_PRECISION), b_Ai00, input_precision=SOLVE_TRIL_DOT_PRECISION) + b_Ai21 = -tl.dot(tl.dot(b_Ai22, b_Akk21, input_precision=SOLVE_TRIL_DOT_PRECISION), b_Ai11, input_precision=SOLVE_TRIL_DOT_PRECISION) + b_Ai32 = -tl.dot(tl.dot(b_Ai33, b_Akk32, input_precision=SOLVE_TRIL_DOT_PRECISION), b_Ai22, input_precision=SOLVE_TRIL_DOT_PRECISION) + + b_Ai20 = -tl.dot( + b_Ai22, + tl.dot(b_Akk20, b_Ai00, input_precision=SOLVE_TRIL_DOT_PRECISION) + + tl.dot(b_Akk21, b_Ai10, input_precision=SOLVE_TRIL_DOT_PRECISION), + input_precision=SOLVE_TRIL_DOT_PRECISION, + ) + b_Ai31 = -tl.dot( + b_Ai33, + tl.dot(b_Akk31, b_Ai11, input_precision=SOLVE_TRIL_DOT_PRECISION) + + tl.dot(b_Akk32, b_Ai21, input_precision=SOLVE_TRIL_DOT_PRECISION), + input_precision=SOLVE_TRIL_DOT_PRECISION, + ) + b_Ai30 = -tl.dot( + b_Ai33, + tl.dot(b_Akk30, b_Ai00, input_precision=SOLVE_TRIL_DOT_PRECISION) + + tl.dot(b_Akk31, b_Ai10, input_precision=SOLVE_TRIL_DOT_PRECISION) + + tl.dot(b_Akk32, b_Ai20, input_precision=SOLVE_TRIL_DOT_PRECISION), + input_precision=SOLVE_TRIL_DOT_PRECISION, + ) + + ################################################################################ + # 6. 
store full Akk_inv to Akk + ################################################################################ + + p_Akk00 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc0, 0), (BC, BC), (1, 0)) + p_Akk10 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc1, 0), (BC, BC), (1, 0)) + p_Akk11 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc1, BC), (BC, BC), (1, 0)) + p_Akk20 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc2, 0), (BC, BC), (1, 0)) + p_Akk21 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc2, BC), (BC, BC), (1, 0)) + p_Akk22 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc2, 2 * BC), (BC, BC), (1, 0)) + p_Akk30 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc3, 0), (BC, BC), (1, 0)) + p_Akk31 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc3, BC), (BC, BC), (1, 0)) + p_Akk32 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc3, 2 * BC), (BC, BC), (1, 0)) + p_Akk33 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc3, 3 * BC), (BC, BC), (1, 0)) + + tl.store(p_Akk00, b_Ai00.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk10, b_Ai10.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk11, b_Ai11.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk20, b_Ai20.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk21, b_Ai21.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk22, b_Ai22.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk30, b_Ai30.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk31, b_Ai31.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk32, b_Ai32.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk33, b_Ai33.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics( + { + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4, 8] for num_stages in [2, 3, 4]], + key=["BK", "NC", "BT"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["B", "T"]) +def chunk_kda_bwd_kernel_intra( + q, + k, + g, + beta, + dAqk, + dAkk, + dq, + dq2, + dk, + dk2, + dg, + dg2, + db, + cu_seqlens, + chunk_indices, + B, + T, + H: tl.constexpr, + K: tl.constexpr, + BT: tl.constexpr, + BC: tl.constexpr, + BK: tl.constexpr, + NC: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_kc, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + i_k, i_i = i_kc // NC, i_kc % NC + + all = B * T + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + else: + bos, eos = i_b * T, i_b * T + T + T = eos - bos + + i_ti = i_t * BT + i_i * BC + if i_ti >= T: + return + + o_k = i_k * BK + tl.arange(0, BK) + m_k = o_k < K + + q += (bos * H + i_h) * K + k += (bos * H + i_h) * K + g += (bos * H + i_h) * K + beta += bos * H + i_h + + dAqk += (bos * H + i_h) * BT + dAkk += (bos * H + i_h) * BT + dq += (bos * H + i_h) * K + dq2 += (bos * H + i_h) * K + dk += (bos * H + i_h) * K + dk2 += (bos * H + i_h) * K + dg += (bos * H + i_h) * K + dg2 += (bos * H + i_h) * K + db += (i_k * all + bos) * H + i_h + + p_g = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + b_g = tl.load(p_g, boundary_check=(0, 1)) + + p_b = tl.make_block_ptr(beta, (T,), (H,), (i_ti,), (BC,), (0,)) + b_b = 
tl.load(p_b, boundary_check=(0,)) + + b_dq2 = tl.zeros([BC, BK], dtype=tl.float32) + b_dk2 = tl.zeros([BC, BK], dtype=tl.float32) + if i_i > 0: + p_gn = g + i_ti * H * K + o_k + # [BK,] + b_gn = tl.load(p_gn, mask=m_k, other=0) + for i_j in range(0, i_i): + p_k = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_t * BT + i_j * BC, i_k * BK), (BC, BK), (1, 0)) + p_gk = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_t * BT + i_j * BC, i_k * BK), (BC, BK), (1, 0)) + p_dAqk = tl.make_block_ptr(dAqk, (T, BT), (H * BT, 1), (i_ti, i_j * BC), (BC, BC), (1, 0)) + p_dAkk = tl.make_block_ptr(dAkk, (T, BT), (H * BT, 1), (i_ti, i_j * BC), (BC, BC), (1, 0)) + # [BC, BK] + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_gk = tl.load(p_gk, boundary_check=(0, 1)) + b_kg = b_k * exp2(b_gn[None, :] - b_gk) + # [BC, BC] + b_dAqk = tl.load(p_dAqk, boundary_check=(0, 1)) + b_dAkk = tl.load(p_dAkk, boundary_check=(0, 1)) + # [BC, BK] + b_dq2 += tl.dot(b_dAqk, b_kg) + b_dk2 += tl.dot(b_dAkk, b_kg) + b_gqn = exp2(b_g - b_gn[None, :]) + b_dq2 *= b_gqn + b_dk2 *= b_gqn + + o_i = tl.arange(0, BC) + m_dA = (i_ti + o_i) < T + o_dA = (i_ti + o_i) * H * BT + i_i * BC + p_kj = k + i_ti * H * K + o_k + p_gkj = g + i_ti * H * K + o_k + + p_q = tl.make_block_ptr(q, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + p_k = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + + for j in range(0, min(BC, T - i_t * BT - i_i * BC)): + # [BC] + b_dAqk = tl.load(dAqk + o_dA + j, mask=m_dA, other=0) + b_dAkk = tl.load(dAkk + o_dA + j, mask=m_dA, other=0) + # [BK] + b_kj = tl.load(p_kj, mask=m_k, other=0).to(tl.float32) + b_gkj = tl.load(p_gkj, mask=m_k, other=0).to(tl.float32) + # [BC, BK] + m_i = o_i[:, None] >= j + # [BC, BK] + b_kgj = b_kj[None, :] * exp2(b_g - b_gkj[None, :]) + b_dq2 += tl.where(m_i, b_dAqk[:, None] * b_kgj, 0.0) + b_dk2 += tl.where(m_i, b_dAkk[:, None] * b_kgj, 0.0) + + p_kj += H * K + p_gkj += H * K + b_db = tl.sum(b_dk2 * b_k, 1) + b_dk2 *= b_b[:, None] + + p_dq = tl.make_block_ptr(dq, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + p_dq2 = tl.make_block_ptr(dq2, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + p_db = tl.make_block_ptr(db, (T,), (H,), (i_ti,), (BC,), (0,)) + + b_dg2 = b_q * b_dq2 + b_dq2 = b_dq2 + tl.load(p_dq, boundary_check=(0, 1)) + tl.store(p_dq2, b_dq2.to(p_dq2.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_db, b_db.to(p_db.dtype.element_ty), boundary_check=(0,)) + + tl.debug_barrier() + b_dkt = tl.zeros([BC, BK], dtype=tl.float32) + + NC = min(NC, tl.cdiv(T - i_t * BT, BC)) + if i_i < NC - 1: + p_gn = g + (min(i_ti + BC, T) - 1) * H * K + o_k + # [BK,] + b_gn = tl.load(p_gn, mask=m_k, other=0) + for i_j in range(i_i + 1, NC): + p_q = tl.make_block_ptr(q, (T, K), (H * K, 1), (i_t * BT + i_j * BC, i_k * BK), (BC, BK), (1, 0)) + p_k = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_t * BT + i_j * BC, i_k * BK), (BC, BK), (1, 0)) + p_gk = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_t * BT + i_j * BC, i_k * BK), (BC, BK), (1, 0)) + p_b = tl.make_block_ptr(beta, (T,), (H,), (i_t * BT + i_j * BC,), (BC,), (0,)) + p_dAqk = tl.make_block_ptr(dAqk, (BT, T), (1, H * BT), (i_i * BC, i_t * BT + i_j * BC), (BC, BC), (0, 1)) + p_dAkk = tl.make_block_ptr(dAkk, (BT, T), (1, H * BT), (i_i * BC, i_t * BT + i_j * BC), (BC, BC), (0, 1)) + # [BC] + b_b = tl.load(p_b, boundary_check=(0,)) + # [BC, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_kb = tl.load(p_k, 
boundary_check=(0, 1)) * b_b[:, None] + b_gk = tl.load(p_gk, boundary_check=(0, 1)) + # [BC, BC] + b_dAqk = tl.load(p_dAqk, boundary_check=(0, 1)) + b_dAkk = tl.load(p_dAkk, boundary_check=(0, 1)) + + o_j = i_t * BT + i_j * BC + o_i + m_j = o_j < T + # [BC, BK] + b_gkn = tl.where(m_j[:, None], exp2(b_gk - b_gn[None, :]), 0) + b_qg = b_q * b_gkn + b_kbg = b_kb * b_gkn + # [BC, BK] + b_dkt += tl.dot(b_dAqk, b_qg) + tl.dot(b_dAkk, b_kbg) + b_dkt *= exp2(b_gn[None, :] - b_g) + + o_dA = i_ti * H * BT + i_i * BC + o_i + p_qj = q + i_ti * H * K + o_k # [bs, i_ti, i_h*block_h, i_k*bk:(i_k+1)*bk] + p_kj = k + i_ti * H * K + o_k + p_gkj = g + i_ti * H * K + o_k + p_bj = beta + i_ti * H + + for j in range(0, min(BC, T - i_t * BT - i_i * BC)): + # [BC,] + b_dAqk = tl.load(dAqk + o_dA + j * H * BT) + b_dAkk = tl.load(dAkk + o_dA + j * H * BT) + # [BK,] + b_qj = tl.load(p_qj, mask=m_k, other=0).to(tl.float32) + b_kbj = tl.load(p_kj, mask=m_k, other=0).to(tl.float32) * tl.load(p_bj) + b_gkj = tl.load(p_gkj, mask=m_k, other=0).to(tl.float32) + # [BC, BK] + m_i = o_i[:, None] <= j + b_gkq = exp2(b_gkj[None, :] - b_g) + b_dkt += tl.where(m_i, (b_dAkk[:, None] * b_kbj[None, :] + b_dAqk[:, None] * b_qj[None, :]) * b_gkq, 0.0) + + p_qj += H * K + p_kj += H * K + p_gkj += H * K + p_bj += H + p_dk = tl.make_block_ptr(dk, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + p_dk2 = tl.make_block_ptr(dk2, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + p_dg = tl.make_block_ptr(dg, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + p_dg2 = tl.make_block_ptr(dg2, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + + b_dg2 += (b_dk2 - b_dkt) * b_k + tl.load(p_dg, boundary_check=(0, 1)) + b_dk2 += tl.load(p_dk, boundary_check=(0, 1)) + b_dk2 += b_dkt + + tl.store(p_dk2, b_dk2.to(p_dk2.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dg2, b_dg2.to(p_dg2.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_kda_bwd_intra( + q: torch.Tensor, + k: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + dAqk: torch.Tensor, + dAkk: torch.Tensor, + dq: torch.Tensor, + dk: torch.Tensor, + db: torch.Tensor, + dg: torch.Tensor, + cu_seqlens: torch.LongTensor = None, + chunk_indices: torch.LongTensor = None, + chunk_size: int = 64, +): + B, T, H, K = k.shape + BT = chunk_size + BC = min(16, BT) + BK = min(32, triton.next_power_of_2(K)) + + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, BT) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + NC = triton.cdiv(BT, BC) + NK = triton.cdiv(K, BK) + + dq2 = torch.empty_like(q) + dk2 = torch.empty_like(k) + db2 = beta.new_empty(NK, *beta.shape, dtype=torch.float) + dg2 = torch.empty_like(dg, dtype=torch.float) + grid = (NK * NC, NT, B * H) + chunk_kda_bwd_kernel_intra[grid]( + q=q, + k=k, + g=g, + beta=beta, + dAqk=dAqk, + dAkk=dAkk, + dq=dq, + dq2=dq2, + dk=dk, + dk2=dk2, + dg=dg, + dg2=dg2, + db=db2, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + B=B, + T=T, + H=H, + K=K, + BT=BT, + BC=BC, + BK=BK, + NC=NC, + ) + dq = dq2 + dk = dk2 + db = db2.sum(0).add_(db) + dg = chunk_local_cumsum( + dg2, + chunk_size=chunk_size, + reverse=True, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + ) + + return dq, dk, db, dg + + +def chunk_kda_fwd_inter_solve_fused( + q, + k, + gk, + beta, + Aqk, + Akk_diag, + Akk, + scale, + cu_seqlens: torch.LongTensor = None, + chunk_size: int = 64, + chunk_indices: torch.LongTensor = None, +): + B, T, H, K = k.shape + assert K 
<= 256 + BT = chunk_size + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, BT) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + + BC = 16 + + grid = (NT, B * H) + chunk_kda_fwd_kernel_inter_solve_fused[grid]( + q=q, + k=k, + g=gk, + beta=beta, + Aqk=Aqk, + Akk_diag=Akk_diag, + Akk=Akk, + scale=scale, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + H=H, + K=K, + BT=BT, + BC=BC, + ) diff --git a/examples/kda/FLA_KDA/fla_chunk_intra_token_parallel.py b/examples/kda/FLA_KDA/fla_chunk_intra_token_parallel.py new file mode 100644 index 000000000..1dba20282 --- /dev/null +++ b/examples/kda/FLA_KDA/fla_chunk_intra_token_parallel.py @@ -0,0 +1,168 @@ +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# Token-parallel implementation of the KDA intra-chunk kernel + +import torch +import triton +import triton.language as tl + +from .fla_utils import exp2, autotune_cache_kwargs + + +@triton.heuristics( + { + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[triton.Config({"BH": BH}, num_warps=num_warps) for BH in [1, 2, 4, 8] for num_warps in [1, 2, 4, 8]], + key=["K", "H"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T", "N"]) +def chunk_kda_fwd_kernel_intra_token_parallel( + q, + k, + g, + beta, + Aqk, + Akk, + scale, + cu_seqlens, + N, + T, + H: tl.constexpr, + K: tl.constexpr, + BT: tl.constexpr, + BC: tl.constexpr, + BH: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_tg, i_hg = tl.program_id(0), tl.program_id(1) + + if IS_VARLEN: + i_n = 0 + left, right = 0, N + + # Unrolled binary search over cu_seqlens to find the sequence containing token i_tg. + # The iteration count can be capped by the expected maximum batch size if needed; + # 20 iterations cover up to 2^20 (~1M) sequences, which is usually enough. + for _ in range(20): + if left < right: + mid = (left + right) // 2 + if i_tg < tl.load(cu_seqlens + mid + 1).to(tl.int32): + right = mid + else: + left = mid + 1 + i_n = left + + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + i_t = i_tg - bos + else: + bos = (i_tg // T) * T + i_t = i_tg % T + + if i_t >= T: + return + + i_c = i_t // BT # chunk index + i_s = (i_t % BT) // BC # sub-chunk index + i_tc = i_c * BT # first token index of the chunk + i_ts = i_tc + i_s * BC # first token index of the sub-chunk + + q += bos * H * K + k += bos * H * K + g += bos * H * K + Aqk += bos * H * BT + Akk += bos * H * BC + beta += bos * H + + BK: tl.constexpr = triton.next_power_of_2(K) + o_h = tl.arange(0, BH) + o_k = tl.arange(0, BK) + m_h = (i_hg * BH + o_h) < H + m_k = o_k < K + + p_q = tl.make_block_ptr(q + i_t * H * K, (H, K), (K, 1), (i_hg * BH, 0), (BH, BK), (1, 0)) + p_k = tl.make_block_ptr(k + i_t * H * K, (H, K), (K, 1), (i_hg * BH, 0), (BH, BK), (1, 0)) + p_g = tl.make_block_ptr(g + i_t * H * K, (H, K), (K, 1), (i_hg * BH, 0), (BH, BK), (1, 0)) + p_beta = tl.make_block_ptr(beta + i_t * H, (H,), (1,), (i_hg * BH,), (BH,), (0,)) + # [BH, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)).to(tl.float32) + b_k = tl.load(p_k, boundary_check=(0, 1)).to(tl.float32) + b_g = tl.load(p_g, boundary_check=(0, 1)).to(tl.float32) + b_k = b_k * tl.load(p_beta, boundary_check=(0,)).to(tl.float32)[:, None] + + for j in range(i_ts, min(i_t + 1, min(T, i_ts + BC))): + p_kj = tl.make_block_ptr(k + j * H * K, (H, K), (K, 1), (i_hg * BH, 0), (BH, BK), (1, 0)) + p_gj = tl.make_block_ptr(g + j * H * K, (H, K), (K, 1), (i_hg * BH, 0), (BH, BK), (1, 0)) + # [BH, BK] + b_kj = tl.load(p_kj, boundary_check=(0, 1)).to(tl.float32) + b_gj 
= tl.load(p_gj, boundary_check=(0, 1)).to(tl.float32) + + b_kgj = b_kj * exp2(b_g - b_gj) + + b_kgj = tl.where(m_k[None, :], b_kgj, 0.0) + # [BH] + b_Aqk = tl.sum(b_q * b_kgj, axis=1) * scale + b_Akk = tl.sum(b_k * b_kgj, axis=1) * tl.where(j < i_t, 1.0, 0.0) + + tl.store(Aqk + i_t * H * BT + (i_hg * BH + o_h) * BT + j % BT, b_Aqk.to(Aqk.dtype.element_ty), mask=m_h) + tl.store(Akk + i_t * H * BC + (i_hg * BH + o_h) * BC + j - i_ts, b_Akk.to(Akk.dtype.element_ty), mask=m_h) + + +def chunk_kda_fwd_intra_token_parallel( + q: torch.Tensor, + k: torch.Tensor, + gk: torch.Tensor, + beta: torch.Tensor, + Aqk: torch.Tensor, + Akk: torch.Tensor, + scale: float, + cu_seqlens: torch.LongTensor = None, + chunk_size: int = 64, + sub_chunk_size: int = 16, +) -> None: + """ + Token-parallel implementation: each token gets its own thread block. + Supports both fixed-length and variable-length sequences. + Reduces wasted computation on padding. + + Writes directly to Aqk and Akk tensors (in-place). + + Args: + q: [B, T, H, K] + k: [B, T, H, K] + gk: [B, T, H, K] cumsum of gates + beta: [B, T, H] + Aqk: [B, T, H, BT] output tensor to write to + Akk: [B, T, H, BC] output tensor for diagonal blocks (fp32) + scale: attention scale + chunk_size: BT (default 64) + sub_chunk_size: BC (default 16) + """ + B, T, H, K = q.shape + N = len(cu_seqlens) - 1 if cu_seqlens is not None else B + BT = chunk_size + BC = sub_chunk_size + + def grid(meta): + return (B * T, triton.cdiv(H, meta["BH"])) + + chunk_kda_fwd_kernel_intra_token_parallel[grid]( + q=q, + k=k, + g=gk, + beta=beta, + Aqk=Aqk, + Akk=Akk, + scale=scale, + cu_seqlens=cu_seqlens, + N=N, + T=T, + H=H, + K=K, + BT=BT, + BC=BC, + ) + return Aqk, Akk diff --git a/examples/kda/FLA_KDA/fla_chunk_o.py b/examples/kda/FLA_KDA/fla_chunk_o.py new file mode 100644 index 000000000..c29db9508 --- /dev/null +++ b/examples/kda/FLA_KDA/fla_chunk_o.py @@ -0,0 +1,546 @@ +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +import torch +import triton +import triton.language as tl + + +from .fla_utils import prepare_chunk_indices, exp, exp2, autotune_cache_kwargs, check_shared_mem + + +BK_LIST = [32, 64] if check_shared_mem() else [16, 32] +BV_LIST = [64, 128] if check_shared_mem("ampere") else [16, 32] + + +@triton.heuristics( + { + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BK": BK, "BV": BV}, num_warps=num_warps, num_stages=num_stages) + for BK in [32, 64] + for BV in [64, 128] + for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=["BT"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_gla_fwd_kernel_o( + q, + v, + g, + h, + o, + A, + cu_seqlens, + chunk_indices, + scale, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_EXP2: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_tg = i_t + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + else: + NT = tl.cdiv(T, BT) + i_tg = i_b * NT + i_t + bos, eos = i_b * T, i_b * T + T + + m_s = tl.arange(0, BT)[:, None] >= tl.arange(0, BT)[None, :] + + b_o = tl.zeros([BT, BV], dtype=tl.float32) + for i_k in range(tl.cdiv(K, BK)): + p_q = 
tl.make_block_ptr(q + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_g = tl.make_block_ptr(g + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_h = tl.make_block_ptr(h + (i_tg * H + i_h) * K * V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + + # [BT, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + # [BT, BK] + b_g = tl.load(p_g, boundary_check=(0, 1)) + # [BT, BK] + if USE_EXP2: + b_qg = (b_q * exp2(b_g)).to(b_q.dtype) + else: + b_qg = (b_q * exp(b_g)).to(b_q.dtype) + # [BK, BV] + b_h = tl.load(p_h, boundary_check=(0, 1)) + # works but dkw, owing to divine benevolence + # [BT, BV] + if i_k >= 0: + b_o += tl.dot(b_qg, b_h.to(b_qg.dtype)) + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_o = tl.make_block_ptr(o + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)) + # [BT, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + # [BT, BT] + b_A = tl.load(p_A, boundary_check=(0, 1)) + b_A = tl.where(m_s, b_A, 0.0).to(b_v.dtype) + b_o += tl.dot(b_A, b_v) + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics( + { + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BK": BK, "BV": BV}, num_warps=num_warps, num_stages=num_stages) + for BK in BK_LIST + for BV in BV_LIST + for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=["BT"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_gla_bwd_kernel_dv( + k, + g, + A, + do, + dh, + dv, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_tg = i_t + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + else: + NT = tl.cdiv(T, BT) + i_tg = i_b * NT + i_t + bos, eos = i_b * T, i_b * T + T + + p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (BT, T), (1, H * BT), (0, i_t * BT), (BT, BT), (0, 1)) + p_do = tl.make_block_ptr(do + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_dv = tl.make_block_ptr(dv + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_A = tl.load(p_A, boundary_check=(0, 1)) + b_do = tl.load(p_do, boundary_check=(0, 1)) + + b_A = tl.where(tl.arange(0, BT)[:, None] <= tl.arange(0, BT)[None, :], b_A, 0.0) + # (SY 09/17) important to disallow tf32 here to maintain a good precision. 
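+    # Intra-chunk contribution: dv += (causally masked) A^T @ do; the inter-chunk contribution via dh is accumulated in the loop below.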
+ b_dv = tl.dot(b_A, b_do.to(b_A.dtype), allow_tf32=False) + + for i_k in range(tl.cdiv(K, BK)): + o_k = i_k * BK + tl.arange(0, BK) + m_k = o_k < K + + p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_gk = tl.make_block_ptr(g + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_gn = g + (bos + min(i_t * BT + BT, T) - 1) * H * K + i_h * K + o_k + p_dh = tl.make_block_ptr(dh + (i_tg * H + i_h) * K * V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_gk = tl.load(p_gk, boundary_check=(0, 1)) + b_dh = tl.load(p_dh, boundary_check=(0, 1)) + + b_gn = exp(tl.load(p_gn, mask=m_k, other=0)[None, :] - b_gk) + b_k = (b_k * b_gn).to(b_k.dtype) + # [BT, BV] + # (SY 09/17) it is ok to have bf16 interchunk gradient contribution here + b_dv += tl.dot(b_k, b_dh.to(b_k.dtype)) + + tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics( + { + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[triton.Config({"BK": BK, "BV": BV}, num_warps=num_warps) for BK in BK_LIST for BV in BV_LIST for num_warps in [2, 4, 8]], + key=["BT"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_gla_bwd_kernel_inter( + q, + k, + v, + g, + h, + do, + dh, + dq, + dk, + dq2, + dk2, + dg, + cu_seqlens, + chunk_indices, + scale, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_k, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_tg = i_t + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + else: + NT = tl.cdiv(T, BT) + i_tg = i_b * NT + i_t + bos, eos = i_b * T, i_b * T + T + o_k = i_k * BK + tl.arange(0, BK) + m_k = o_k < K + + q += (bos * H + i_h) * K + k += (bos * H + i_h) * K + v += (bos * H + i_h) * V + g += (bos * H + i_h) * K + h += (i_tg * H + i_h) * K * V + do += (bos * H + i_h) * V + dh += (i_tg * H + i_h) * K * V + dq += (bos * H + i_h) * K + dk += (bos * H + i_h) * K + dq2 += (bos * H + i_h) * K + dk2 += (bos * H + i_h) * K + dg += (bos * H + i_h) * K + + p_gk = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_gk = tl.load(p_gk, boundary_check=(0, 1)) + p_gn = g + (min(T, i_t * BT + BT) - 1) * H * K + o_k + b_gn = tl.load(p_gn, mask=m_k, other=0) + b_dq = tl.zeros([BT, BK], dtype=tl.float32) + b_dk = tl.zeros([BT, BK], dtype=tl.float32) + b_dgk = tl.zeros([BK], dtype=tl.float32) + + for i_v in range(tl.cdiv(V, BV)): + p_v = tl.make_block_ptr(v, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_do = tl.make_block_ptr(do, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_h = tl.make_block_ptr(h, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1)) + p_dh = tl.make_block_ptr(dh, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1)) + # [BT, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_do = tl.load(p_do, boundary_check=(0, 1)) + # [BV, BK] + b_h = tl.load(p_h, boundary_check=(0, 1)) + b_dh = tl.load(p_dh, boundary_check=(0, 1)) + + # [BK] + b_dgk += tl.sum(b_h * b_dh, axis=0) + # [BT, BK] + b_dq += tl.dot(b_do, 
b_h.to(b_do.dtype)) + b_dk += tl.dot(b_v, b_dh.to(b_v.dtype)) + + b_dgk *= exp(b_gn) + b_dq *= scale + b_dq = b_dq * exp(b_gk) + b_dk = b_dk * exp(b_gn[None, :] - b_gk) + + p_q = tl.make_block_ptr(q, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_k = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dq = tl.make_block_ptr(dq, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dk = tl.make_block_ptr(dk, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_dgk += tl.sum(b_dk * b_k, axis=0) + b_dq += tl.load(p_dq, boundary_check=(0, 1)) + b_dk += tl.load(p_dk, boundary_check=(0, 1)) + b_dg = b_q * b_dq - b_k * b_dk + # tl.debug_barrier() + b_dg = b_dg - tl.cumsum(b_dg, axis=0) + tl.sum(b_dg, axis=0)[None, :] + b_dgk[None, :] + # Buggy due to strange triton compiler issue. + # m_s = tl.where(tl.arange(0, BT)[:, None] <= tl.arange(0, BT)[None, :], 1., 0.) + # b_dg = tl.dot(m_s, b_dg, allow_tf32=False) + b_dgk[None, :] + p_dq = tl.make_block_ptr(dq2, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dk = tl.make_block_ptr(dk2, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dg = tl.make_block_ptr(dg, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dg, b_dg.to(p_dg.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_gla_fwd_o_gk( + q: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + A: torch.Tensor, + h: torch.Tensor, + scale: float, + cu_seqlens: torch.LongTensor = None, + chunk_size: int = 64, + chunk_indices: torch.LongTensor = None, + use_exp2: bool = False, +): + B, T, H, K, V = *q.shape, v.shape[-1] + BT = chunk_size + + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, chunk_size) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + + o = torch.empty_like(v) + + def grid(meta): + return (triton.cdiv(V, meta["BV"]), NT, B * H) + + chunk_gla_fwd_kernel_o[grid]( + q=q, + v=v, + g=g, + h=h, + o=o, + A=A, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + scale=scale, + T=T, + H=H, + K=K, + V=V, + BT=BT, + USE_EXP2=use_exp2, + ) + return o + + +NUM_WARPS = [2, 4] + + +@triton.heuristics( + { + "USE_G": lambda args: args["g"] is not None, + "USE_G_GAMMA": lambda args: args["g_gamma"] is not None, + "USE_A": lambda args: args["A"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in NUM_WARPS for num_stages in [2, 3, 4]], + key=["H", "K", "V", "BT", "BK", "BV", "USE_G"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_bwd_kernel_dv_local( + q, + k, + g, + g_gamma, + A, + do, + dv, + cu_seqlens, + chunk_indices, + scale, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + USE_G_GAMMA: tl.constexpr, + USE_A: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + 
i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + # offset calculation + q += (bos * H + i_h) * K + k += (bos * H + i_h) * K + do += (bos * H + i_h) * V + dv += (bos * H + i_h) * V + + if USE_A: + p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (BT, T), (1, H * BT), (0, i_t * BT), (BT, BT), (0, 1)) + b_A = tl.load(p_A, boundary_check=(0, 1)) + + o_t = i_t * BT + tl.arange(0, BT) + m_t = o_t < T + m_A = (o_t[:, None] <= o_t[None, :]) & (m_t[:, None] & m_t) + b_A = tl.where(m_A, b_A, 0).to(do.dtype.element_ty) + + for i_v in range(tl.cdiv(V, BV)): + p_do = tl.make_block_ptr(do, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_dv = tl.make_block_ptr(dv, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_do = tl.load(p_do, boundary_check=(0, 1)) + b_dv = tl.dot(b_A.to(b_do.dtype), b_do) + tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_bwd_dv_local( + q: torch.Tensor, + k: torch.Tensor, + do: torch.Tensor, + g: torch.Tensor = None, + g_gamma: torch.Tensor = None, + A: torch.Tensor = None, + scale: float = None, + cu_seqlens: torch.LongTensor = None, + chunk_size: int = 64, + chunk_indices: torch.LongTensor = None, +) -> torch.Tensor: + B, T, H, K, V = *k.shape, do.shape[-1] + BT = chunk_size + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, BT) + # H100 can have larger block size + if check_shared_mem("hopper", k.device.index): + CONST_TILING = 128 + elif check_shared_mem: + CONST_TILING = 64 + else: + CONST_TILING = 32 + BK = min(max(triton.next_power_of_2(K), 16), CONST_TILING) + BV = min(max(triton.next_power_of_2(V), 16), CONST_TILING) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + + dv = torch.empty_like(do) + grid = (NT, B * H) + chunk_bwd_kernel_dv_local[grid]( + q=q, + k=k, + g=g, + g_gamma=g_gamma, + A=A, + do=do, + dv=dv, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + scale=scale, + T=T, + H=H, + K=K, + V=V, + BT=BT, + BK=BK, + BV=BV, + ) + return dv + + +@triton.heuristics( + { + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4, 8] for num_stages in [2, 3, 4]], + key=["BV", "BT"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_gla_bwd_kernel_dA( + v, + do, + dA, + cu_seqlens, + chunk_indices, + scale, + T, + H: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BV: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + else: + bos, eos = i_b * T, i_b * T + T + T = eos - bos + + b_dA = tl.zeros([BT, BT], dtype=tl.float32) + for i_v in range(tl.cdiv(V, BV)): + p_do = tl.make_block_ptr(do + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (V, T), (1, H * V), (i_v * BV, i_t * BT), (BV, BT), (0, 1)) + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_do = tl.load(p_do, boundary_check=(0, 1)) + + b_dA += tl.dot(b_do, b_v) + + p_dA = tl.make_block_ptr(dA + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), 
(1, 0)) + m_s = tl.arange(0, BT)[:, None] >= tl.arange(0, BT)[None, :] + b_dA = tl.where(m_s, b_dA * scale, 0.0) + tl.store(p_dA, b_dA.to(p_dA.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_gla_bwd_dA( + v: torch.Tensor, + do: torch.Tensor, + scale: float, + cu_seqlens: torch.LongTensor = None, + chunk_size: int = 64, + chunk_indices: torch.LongTensor = None, +): + B, T, H, V = v.shape + BT = chunk_size + + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, chunk_size) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + BV = min(64, triton.next_power_of_2(V)) + + dA = v.new_empty(B, T, H, BT, dtype=torch.float32) + grid = (NT, B * H) + chunk_gla_bwd_kernel_dA[grid]( + v=v, + do=do, + dA=dA, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + scale=scale, + T=T, + H=H, + V=V, + BT=BT, + BV=BV, + ) + return dA diff --git a/examples/kda/FLA_KDA/fla_utils.py b/examples/kda/FLA_KDA/fla_utils.py new file mode 100644 index 000000000..b278aec90 --- /dev/null +++ b/examples/kda/FLA_KDA/fla_utils.py @@ -0,0 +1,240 @@ +import contextlib +import functools +import inspect +import os +import warnings +from collections.abc import Callable +from typing import Any +from packaging import version +from enum import Enum + +import torch +import triton +import triton.language.extra.libdevice as tldevice + + +device = "cuda" +device_torch_lib = getattr(torch, device) + +exp = tldevice.fast_expf +exp2 = tldevice.exp2 +log = tldevice.fast_logf +log2 = tldevice.fast_log2f + +IS_NVIDIA_HOPPER = True and ("NVIDIA H" in torch.cuda.get_device_name(0) or torch.cuda.get_device_capability()[0] >= 9) +USE_CUDA_GRAPH = True and os.environ.get("FLA_USE_CUDA_GRAPH", "0") == "1" + + +FLA_CACHE_RESULTS = os.getenv("FLA_CACHE_RESULTS", "1") == "1" +SUPPORTS_AUTOTUNE_CACHE = "cache_results" in inspect.signature(triton.autotune).parameters +autotune_cache_kwargs = {"cache_results": FLA_CACHE_RESULTS} if SUPPORTS_AUTOTUNE_CACHE else {} + + +# error check,copy from +def get_abs_err(x, y): + return (x.detach() - y.detach()).flatten().abs().max().item() + + +def get_err_ratio(x, y): + err = (x.detach() - y.detach()).flatten().square().mean().sqrt().item() + base = (x.detach()).flatten().square().mean().sqrt().item() + return err / (base + 1e-8) + + +def assert_close(prefix, ref, tri, ratio, warning=False, err_atol=1e-6): + abs_atol = get_abs_err(ref, tri) + msg = f"{prefix:>16} diff: {abs_atol:.6f} ratio: {get_err_ratio(ref, tri):.6f}" + print(msg) + error_rate = get_err_ratio(ref, tri) + if abs_atol <= err_atol: + return + if warning or (error_rate < 0.01 or abs_atol <= 0.3): + if error_rate > ratio: + warnings.warn(msg, stacklevel=2) + else: + assert error_rate < ratio, msg + + +def tensor_cache( + fn: Callable[..., torch.Tensor], +) -> Callable[..., torch.Tensor]: + """ + A decorator that caches the most recent result of a function with tensor inputs. + + This decorator will store the output of the decorated function for the most recent set of input tensors. + If the function is called again with the same input tensors, it will return the cached result. + + + Args: + fn (Callable[..., torch.Tensor]): + The function to be decorated. It should take tensor inputs and return tensor outputs. + + Returns: + Callable[..., torch.Tensor]: + A wrapped version of the input function with single-entry caching. 
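+
+    Example (a minimal sketch; the cache is keyed on the identity of the argument objects):
+        >>> @tensor_cache
+        ... def cumsum(x: torch.Tensor) -> torch.Tensor:
+        ...     return x.cumsum(0)
+        >>> t = torch.arange(4)
+        >>> cumsum(t) is cumsum(t)  # second call with the same tensor object returns the cached result
+        True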
+ """ + last_args: tuple | None = None + last_kwargs: dict | None = None + last_result: Any = None + + @functools.wraps(fn) + def wrapper(*args: Any, **kwargs: Any) -> Any: + nonlocal last_args, last_kwargs, last_result + + if ( + last_args is not None + and last_kwargs is not None + and len(args) == len(last_args) + and len(kwargs) == len(last_kwargs) + and all(a is b for a, b in zip(args, last_args)) + and all(k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items()) + ): + return last_result + + result = fn(*args, **kwargs) + last_args, last_kwargs, last_result = args, kwargs, result + return result + + return wrapper + + +@tensor_cache +def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return torch.diff(cu_seqlens) + + +@tensor_cache +def prepare_chunk_indices( + cu_seqlens: torch.LongTensor, + chunk_size: int, +) -> torch.LongTensor: + indices = torch.cat([torch.arange(n) for n in triton.cdiv(prepare_lens(cu_seqlens), chunk_size).tolist()]) + return torch.stack([indices.eq(0).cumsum(0) - 1, indices], 1).to(cu_seqlens) + + +# @functools.cache +# def get_multiprocessor_count(tensor_idx: int = 0) -> int: +# try: +# return triton.runtime.driver.active.utils.get_device_properties(tensor_idx)['multiprocessor_count'] +# except BaseException: +# # Maybe we use a NPU device. +# if triton.runtime.driver.active.get_current_target().backend == 'npu': +# return triton.runtime.driver.active.utils.get_device_properties(tensor_idx)['num_vectorcore'] +# else: +# return 1 +@functools.cache +def get_multiprocessor_count(tensor_idx: int = 0) -> int: + """ + Compatible across Triton versions: + - 2.0.x + - 2.1.0 + - 2.2.x and above + Supports CUDA and NPU. + """ + + # ---- Try the newer Triton 2.2+ API ---- + try: + drv = triton.runtime.driver.active + props = drv.utils.get_device_properties(tensor_idx) + return props.get("multiprocessor_count") or props.get("num_vectorcore") or 1 + except Exception: + pass + + # ---- Fallback: Triton 2.0 / 2.1 API ---- + try: + cuda = triton.runtime.driver.CudaDriver + dev = cuda.get_current_device() + props = cuda.get_device_properties(dev) + return props.get("multiprocessor_count", 1) + except Exception: + pass + + return 1 + + +def input_guard( + fn: Callable[..., torch.Tensor], +) -> Callable[..., torch.Tensor]: + """ + A decorator to make sure all input tensors are contiguous and set the device based on input tensors. 
+ """ + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + contiguous_args = (i if not isinstance(i, torch.Tensor) else i.contiguous() for i in args) + contiguous_kwargs = {k: (v if not isinstance(v, torch.Tensor) else v.contiguous()) for k, v in kwargs.items()} + + tensor = None + for arg in args: + if isinstance(arg, torch.Tensor): + tensor = arg + break + if tensor is None: + for value in kwargs.values(): + if isinstance(value, torch.Tensor): + tensor = value + break + + if tensor is not None: + ctx = custom_device_ctx(tensor.device.index) + else: + ctx = contextlib.nullcontext() + + with ctx: + return fn(*contiguous_args, **contiguous_kwargs) + + return wrapper + + +@functools.cache +def check_pytorch_version(version_s: str = "2.4") -> bool: + return version.parse(torch.__version__) >= version.parse(version_s) + + +if check_pytorch_version("2.4"): + device = "cuda" + autocast_custom_fwd = functools.partial(torch.amp.custom_fwd, device_type=device) + autocast_custom_bwd = functools.partial(torch.amp.custom_bwd, device_type=device) + + def custom_device_ctx(index: int): + return device_torch_lib.device(index) +else: + assert device == "cuda", "Only cuda device is supported for PyTorch version < 2.4.0." + autocast_custom_fwd = device_torch_lib.amp.custom_fwd + autocast_custom_bwd = device_torch_lib.amp.custom_bwd + + def custom_device_ctx(index: int): + return torch.cuda.device(index) + + +class Backend(Enum): + ADA = 101376 # RTX 4090 + AMPERE = 166912 # A100 + HOPPER = 232448 # H100 + DEFAULT = 102400 # Default + + @classmethod + def get_shared_memory(cls, arch: str) -> int: + try: + return cls[arch.upper()].value + except KeyError: + return cls.DEFAULT.value + + +def get_all_max_shared_mem(): + try: + return [ + triton.runtime.driver.active.utils.get_device_properties(i)["max_shared_mem"] for i in range(device_torch_lib.device_count()) + ] + except BaseException: + return [-1] + + +@functools.cache +def check_shared_mem(arch: str = "none", tensor_idx: int = 0) -> bool: + try: + device_shared_mem_list = get_all_max_shared_mem() + max_shared_memory = device_shared_mem_list[tensor_idx] + return max_shared_memory >= Backend.get_shared_memory(arch) + except Exception: + return False diff --git a/examples/kda/FLA_KDA/fla_wy_fast.py b/examples/kda/FLA_KDA/fla_wy_fast.py new file mode 100644 index 000000000..a042c2a5f --- /dev/null +++ b/examples/kda/FLA_KDA/fla_wy_fast.py @@ -0,0 +1,312 @@ +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +import torch +import triton +import triton.language as tl + +from .fla_utils import prepare_chunk_indices, exp2, autotune_cache_kwargs + + +@triton.heuristics( + { + "STORE_QG": lambda args: args["qg"] is not None, + "STORE_KG": lambda args: args["kg"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"DOT_PRECISION": DOT_PRECISION}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4] + for DOT_PRECISION in (["tf32x3", "ieee"]) + ], + key=["H", "K", "V", "BT", "BK", "BV", "IS_VARLEN"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def recompute_w_u_fwd_kernel( + q, + k, + qg, + kg, + v, + beta, + w, + u, + A, + gk, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + STORE_QG: tl.constexpr, + STORE_KG: tl.constexpr, + IS_VARLEN: tl.constexpr, + DOT_PRECISION: tl.constexpr, +): + i_t, i_bh = 
tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + p_b = tl.make_block_ptr(beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + b_b = tl.load(p_b, boundary_check=(0,)) + + p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)) + b_A = tl.load(p_A, boundary_check=(0, 1)) + + for i_v in range(tl.cdiv(V, BV)): + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_u = tl.make_block_ptr(u + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_vb = (b_v * b_b[:, None]).to(b_v.dtype) + b_u = tl.dot(b_A, b_vb, input_precision=DOT_PRECISION) + tl.store(p_u, b_u.to(p_u.dtype.element_ty), boundary_check=(0, 1)) + + for i_k in range(tl.cdiv(K, BK)): + p_w = tl.make_block_ptr(w + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_kb = b_k * b_b[:, None] # multiply k by beta + + p_gk = tl.make_block_ptr(gk + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_gk = tl.load(p_gk, boundary_check=(0, 1)) + b_kb *= exp2(b_gk) + if STORE_QG: + p_q = tl.make_block_ptr(q + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_qg = tl.make_block_ptr(qg + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_qg = b_q * exp2(b_gk) + tl.store(p_qg, b_qg.to(p_qg.dtype.element_ty), boundary_check=(0, 1)) + if STORE_KG: + last_idx = min(i_t * BT + BT, T) - 1 + o_k = i_k * BK + tl.arange(0, BK) + m_k = o_k < K + b_gn = tl.load(gk + ((bos + last_idx) * H + i_h) * K + o_k, mask=m_k, other=0.0) # g of the last token in the chunk + b_kg = b_k * tl.where((i_t * BT + tl.arange(0, BT) < T)[:, None], exp2(b_gn[None, :] - b_gk), 0) + p_kg = tl.make_block_ptr(kg + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + tl.store(p_kg, b_kg.to(p_kg.dtype.element_ty), boundary_check=(0, 1)) + + b_w = tl.dot(b_A, b_kb.to(b_k.dtype)) + tl.store(p_w, b_w.to(p_w.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics( + { + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [2, 4] for num_stages in [2, 3, 4]], + key=["H", "K", "V", "BT", "BK", "BV", "IS_VARLEN"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def prepare_wy_repr_bwd_kernel( + k, + v, + beta, + gk, + A, + dA, + dw, + du, + dk, + dk2, + dv, + db, + dg, + dg2, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + 
i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + p_b = tl.make_block_ptr(beta + (bos * H + i_h), (T,), (H,), (i_t * BT,), (BT,), (0,)) + p_db = tl.make_block_ptr(db + (bos * H + i_h), (T,), (H,), (i_t * BT,), (BT,), (0,)) + p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (BT, T), (1, H * BT), (0, i_t * BT), (BT, BT), (0, 1)) + + b_b = tl.load(p_b, boundary_check=(0,)) + b_db = tl.zeros([BT], dtype=tl.float32) + b_A = tl.load(p_A, boundary_check=(0, 1)) + b_dA = tl.zeros([BT, BT], dtype=tl.float32) + + for i_k in range(tl.cdiv(K, BK)): + p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dk = tl.make_block_ptr(dk + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dk2 = tl.make_block_ptr(dk2 + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dw = tl.make_block_ptr(dw + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dg = tl.make_block_ptr(dg + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dg2 = tl.make_block_ptr(dg2 + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + + # [BT, BK] + b_k = tl.load(p_k, boundary_check=(0, 1)) + p_gk = tl.make_block_ptr(gk + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_gk_exp = exp2(tl.load(p_gk, boundary_check=(0, 1))) + b_kbg = b_k * b_b[:, None] * b_gk_exp + b_dw = tl.load(p_dw, boundary_check=(0, 1)) + + b_dA += tl.dot(b_dw, tl.trans(b_kbg).to(b_dw.dtype)) + b_dkbg = tl.dot(b_A, b_dw) + b_dk = b_dkbg * b_gk_exp * b_b[:, None] + tl.load(p_dk, boundary_check=(0, 1)) + b_db += tl.sum(b_dkbg * b_k * b_gk_exp, 1) + b_dg = b_kbg * b_dkbg + tl.load(p_dg, boundary_check=(0, 1)) + + tl.store(p_dk2, b_dk.to(p_dk2.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dg2, b_dg.to(p_dg2.dtype.element_ty), boundary_check=(0, 1)) + + for i_v in range(tl.cdiv(V, BV)): + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_dv = tl.make_block_ptr(dv + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_du = tl.make_block_ptr(du + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_vb = (b_v * b_b[:, None]).to(b_v.dtype) + b_du = tl.load(p_du, boundary_check=(0, 1)) + b_dA += tl.dot(b_du, tl.trans(b_vb)) + b_dvb = tl.dot(b_A, b_du) + b_dv = b_dvb * b_b[:, None] + b_db += tl.sum(b_dvb * b_v, 1) + tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + + o_t = i_t * BT + tl.arange(0, BT) + m_t = o_t < T + m_A = (o_t[:, None] > o_t[None, :]) & (m_t[:, None] & m_t) + b_dA = tl.where(m_A, b_dA, 0) + b_dA = tl.dot(b_dA.to(b_A.dtype), b_A) + b_dA = tl.dot(b_A, b_dA.to(b_A.dtype)) + + b_dA = tl.where(m_A, -b_dA, 0) + + # if using gk, save dA first and handle dk in another kernel + p_dA = tl.make_block_ptr(dA + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)) + tl.store(p_dA, b_dA.to(p_dA.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_db, b_db.to(p_db.dtype.element_ty), boundary_check=(0,)) + + +def recompute_w_u_fwd( + k: torch.Tensor, + v: torch.Tensor, + beta: torch.Tensor, + A: torch.Tensor, + q: torch.Tensor = None, + gk: torch.Tensor = None, + cu_seqlens: torch.LongTensor = None, + chunk_indices: torch.LongTensor = None, +) -> tuple[torch.Tensor, 
torch.Tensor]: + B, T, H, K, V = *k.shape, v.shape[-1] + BT = A.shape[-1] + BK = 64 + BV = 64 + + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, BT) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + + w = torch.empty_like(k) + u = torch.empty_like(v) + qg = torch.empty_like(q) if q is not None else None + kg = torch.empty_like(k) if gk is not None else None + recompute_w_u_fwd_kernel[(NT, B * H)]( + q=q, + k=k, + qg=qg, + kg=kg, + v=v, + beta=beta, + w=w, + u=u, + A=A, + gk=gk, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + H=H, + K=K, + V=V, + BT=BT, + BK=BK, + BV=BV, + ) + return w, u, qg, kg + + +def prepare_wy_repr_bwd( + k: torch.Tensor, + v: torch.Tensor, + beta: torch.Tensor, + gk: torch.Tensor, + A: torch.Tensor, + dk: torch.Tensor, + dw: torch.Tensor, + du: torch.Tensor, + dg: torch.Tensor, + cu_seqlens: torch.LongTensor = None, + chunk_indices: torch.LongTensor = None, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + B, T, H, K, V = *k.shape, v.shape[-1] + BT = 64 + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, BT) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + CONST_TILING = 64 + BK = min(max(triton.next_power_of_2(K), 16), CONST_TILING) + BV = min(max(triton.next_power_of_2(V), 16), CONST_TILING) + + dk2 = torch.empty_like(dk, dtype=torch.float) + dv = torch.empty_like(v) + dg2 = torch.empty_like(gk, dtype=torch.float) + dA = torch.empty_like(A, dtype=torch.float) + db = torch.empty_like(beta, dtype=torch.float) + prepare_wy_repr_bwd_kernel[(NT, B * H)]( + k=k, + v=v, + beta=beta, + gk=gk, + A=A, + dA=dA, + dw=dw, + du=du, + dk=dk, + dk2=dk2, + dv=dv, + db=db, + dg=dg, + dg2=dg2, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + H=H, + K=K, + V=V, + BT=BT, + BK=BK, + BV=BV, + ) + dk = dk2 + dg = dg2 + + return dk, dv, db, dg, dA diff --git a/examples/kda/README.md b/examples/kda/README.md new file mode 100644 index 000000000..f445a9f09 --- /dev/null +++ b/examples/kda/README.md @@ -0,0 +1,7 @@ +# KDA kernel implementation with TileLang +## Requirements +- TileLang: 0.1.6.post2+cuda.git729e66ca +- triton: 3.2.0 +- FLA: commit 9714c5 (used for comparison) + +We copy the needed files and functions from flash-linear-attention into FLA_KDA/ for easy comparison. 
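+
+## Running the comparisons
+Each `chunk_*.py` script builds a TileLang kernel, checks it against the corresponding FLA Triton kernel with `compare_tensors`, and prints `do_bench` timings for both. A minimal way to run one of them (a sketch; it assumes a CUDA GPU and that this directory is the working directory so that `FLA_KDA` and `test_utils_kda` are importable):
+
+```python
+import chunk_bwd_dv
+
+chunk_bwd_dv.main()  # compares the TileLang dv kernel against FLA's chunk_bwd_dv_local and prints fla_time / tilelang_time
+```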
diff --git a/examples/kda/chunk_bwd_dqkwg.py b/examples/kda/chunk_bwd_dqkwg.py new file mode 100644 index 000000000..d3d4df4b4 --- /dev/null +++ b/examples/kda/chunk_bwd_dqkwg.py @@ -0,0 +1,274 @@ +import tilelang +import tilelang.language as T +from tilelang.autotuner import autotune + +from FLA_KDA.fla_chunk_inter import chunk_kda_bwd_dqkwg +from test_utils_kda import do_bench, compare_tensors + +import torch + +torch.random.manual_seed(42) + + +def prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + gate_dtype, +): + BS = S // chunk_size + q = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + k = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + v_new = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + w = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() + g = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() + h = torch.randn(B, BS, H, DK, DV, dtype=input_dtype).cuda() + dv = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + do = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + dh = torch.randn(B, BS, H, DK, DV, dtype=input_dtype).cuda() + + return q, k, v_new, w, g, h, dv, do, dh + + +def prepare_output( + B, + S, + H, + DK, + DV, + chunk_size, + gate_dtype, +): + dq = torch.randn(B, S, H, DK, dtype=torch.float32).cuda() + dk = torch.randn(B, S, H, DK, dtype=torch.float32).cuda() + dw = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() + dg = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() + return dq, dk, dw, dg + + +def get_configs(): + import itertools + + block_DK = [32, 64, 128] + block_DV = [32, 64, 128] + threads = [32, 64, 128, 256] + num_stages = [0, 1, 2, 3] + _configs = list(itertools.product(block_DK, block_DV, threads, num_stages)) + + configs = [{"block_DK": c[0], "block_DV": c[1], "threads": c[2], "num_stages": c[3]} for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=3, rep=5) +@tilelang.jit(out_idx=[-4, -3, -2, -1], pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}) +def chunk_bwd_dqkwg( + B, + S, + H, + DK, + DV, + scale, + chunk_size, + input_dtype, + gate_dtype, + block_DK=32, + block_DV=32, + threads=32, + num_stages=0, +): + block_S = chunk_size + BS = S // block_S + K_shape = (B, S, H, DK) + V_shape = (B, S, H, DV) + H_shape = (B, BS, H, DK, DV) + + @T.prim_func + def kernel( + Q: T.Tensor(K_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + G: T.Tensor(K_shape, dtype=gate_dtype), + h: T.Tensor(H_shape, dtype=input_dtype), + dv: T.Tensor(V_shape, dtype=input_dtype), + DO: T.Tensor(V_shape, dtype=input_dtype), + Dh: T.Tensor(H_shape, dtype=input_dtype), + dq: T.Tensor(K_shape, dtype=T.float32), + dk: T.Tensor(K_shape, dtype=T.float32), + dw: T.Tensor(K_shape, dtype=gate_dtype), + dg: T.Tensor(K_shape, dtype=gate_dtype), + ): + with T.Kernel(T.ceildiv(DK, block_DK), T.ceildiv(S, block_S), B * H, threads=threads) as (bk, bs, bbh): + bb, bh = bbh // H, bbh % H + chunk_last_idx = T.min(S, (bs + 1) * block_S) - 1 + + dgkn_fragment = T.alloc_fragment((block_DK), dtype=T.float32) + dgkn_fragment_tmp = T.alloc_fragment((block_DK,), dtype=T.float32) + dq_fragment = T.alloc_fragment((block_S, block_DK), dtype=T.float32) + dk_fragment = T.alloc_fragment((block_S, block_DK), dtype=T.float32) + dw_fragment = T.alloc_fragment((block_S, block_DK), dtype=T.float32) + dgk_shared = T.alloc_shared((block_S, block_DK), dtype=T.float32) + + h_shared = T.alloc_shared((block_DK, block_DV), dtype=input_dtype) + dh_shared = 
T.alloc_shared((block_DK, block_DV), dtype=input_dtype) + dgkn_shared = T.alloc_shared((block_DK, block_DV), dtype=input_dtype) # d of last token in a chunk + V_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + DO_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + DV_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + G_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) # chunk G + Gn_shared = T.alloc_shared((block_DK), dtype=input_dtype) # chunk last token G + Q_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + K_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + + dkkn_shared = T.alloc_shared((block_S, block_DK), dtype=T.float32) + pp_shared = T.alloc_shared((block_DK), dtype=T.float32) + + T.clear(dgkn_fragment) + T.clear(dq_fragment) + T.clear(dk_fragment) + T.clear(dw_fragment) + + T.copy(G[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK], G_shared) + T.copy(G[bb, chunk_last_idx, bh, bk * block_DK : (bk + 1) * block_DK], Gn_shared) + + for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): + T.copy(h[bb, bs, bh, bk * block_DK : (bk + 1) * block_DK, i_v * block_DV : (i_v + 1) * block_DV], h_shared) + T.copy(Dh[bb, bs, bh, bk * block_DK : (bk + 1) * block_DK, i_v * block_DV : (i_v + 1) * block_DV], dh_shared) + T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], V_shared) + T.copy(DO[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], DO_shared) + T.copy(dv[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], DV_shared) + # += reduce_sum + for i_k1, i_v1 in T.Parallel(block_DK, block_DV): + dgkn_shared[i_k1, i_v1] = h_shared[i_k1, i_v1] * dh_shared[i_k1, i_v1] + T.reduce_sum(dgkn_shared, dgkn_fragment_tmp, dim=1, clear=True) # [block_DK] + for i_ks in T.Parallel(block_DK): + dgkn_fragment[i_ks] += dgkn_fragment_tmp[i_ks] + T.gemm(DO_shared, h_shared, dq_fragment, transpose_B=True, clear_accum=False) # [block_S, block_DK] + T.gemm(V_shared, dh_shared, dk_fragment, transpose_B=True, clear_accum=False) # [block_S, block_DK] + T.gemm(DV_shared, h_shared, dw_fragment, transpose_B=True, clear_accum=False) # [block_S, block_DK] + # chunk last token + for i_k0 in T.Parallel(block_DK): + dgkn_fragment[i_k0] = dgkn_fragment[i_k0] * T.exp2(Gn_shared[i_k0]) + + for i_s, i_k in T.Parallel(block_S, block_DK): + dw_fragment[i_s, i_k] = -dw_fragment[i_s, i_k] + dq_fragment[i_s, i_k] = dq_fragment[i_s, i_k] * scale * T.exp2(G_shared[i_s, i_k]) + dk_fragment[i_s, i_k] = dk_fragment[i_s, i_k] * T.exp2(Gn_shared[i_k] - G_shared[i_s, i_k]) + + T.copy(dw_fragment, dw[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + T.copy(dq_fragment, dq[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + T.copy(dk_fragment, dk[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + + T.copy(Q[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK], Q_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK], K_shared) + + for i_s2, i_k2 in T.Parallel(block_S, block_DK): + dkkn_shared[i_s2, i_k2] = dk_fragment[i_s2, i_k2] * K_shared[i_s2, i_k2] + T.reduce_sum(dkkn_shared, pp_shared, dim=0, clear=True) + for i_k3 in T.Parallel(block_DK): + pp_shared[i_k3] += dgkn_fragment[i_k3] + + for i_s4, i_k4 in T.Parallel(block_S, block_DK): + 
dgk_shared[i_s4, i_k4] = ( + Q_shared[i_s4, i_k4] * dq_fragment[i_s4, i_k4] + - K_shared[i_s4, i_k4] * dk_fragment[i_s4, i_k4] + + T.if_then_else(chunk_last_idx == bs * block_S + i_s4, pp_shared[i_k4], 0.0) + ) + + T.copy(dgk_shared, dg[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + + return kernel + + +def run_test( + B, + S, + H, + DK, + DV, + scale, + input_dtype, + gate_dtype, + qk_dtype, + chunk_size, + use_gk=True, + use_initial_state=True, + store_final_state=True, + save_new_value=True, + block_DK=64, + block_DV=32, + threads=128, + num_stages=0, +): + q, k, v_new, w, g, h, dv, do, dh = prepare_input(B, S, H, DK, DV, chunk_size, getattr(torch, input_dtype), getattr(torch, gate_dtype)) + + dq_ref, dk_ref, dw_ref, dg_ref = chunk_kda_bwd_dqkwg( + q=q, + k=k, + v=v_new, + w=w, + g=g, + h=h, + dv=dv, + do=do, + dh=dh, + scale=scale, + ) + + dq, dk, dw, dg = prepare_output(B, S, H, DK, DV, chunk_size, getattr(torch, gate_dtype)) + kernel = chunk_bwd_dqkwg( + B=B, S=S, H=H, DK=DK, DV=DV, scale=scale, chunk_size=chunk_size, input_dtype=input_dtype, gate_dtype=gate_dtype + ) + dq, dk, dw, dg = kernel(q, k, v_new, g, h, dv, do, dh) + + compare_tensors("dq", dq_ref, dq) + compare_tensors("dk", dk_ref, dk) + compare_tensors("dw", dw_ref, dw) + compare_tensors("dg", dg_ref, dg) + + fla_time = do_bench( + chunk_kda_bwd_dqkwg, + q=q, + k=k, + v=v_new, + w=w, + g=g, + h=h, + dv=dv, + do=do, + dh=dh, + scale=scale, + ) + tilelang_time = do_bench(kernel, q, k, v_new, g, h, dv, do, dh) + print("fla_time:", fla_time) + print("tilelang_time:", tilelang_time) + + +def main(): + run_test( + B=1, + S=8192, + H=64, + DK=128, + DV=128, + scale=1.0, + input_dtype="float32", + gate_dtype="float32", # gate must be float32 + qk_dtype="float32", + chunk_size=64, + use_gk=True, + use_initial_state=True, + store_final_state=True, + save_new_value=True, + block_DK=32, + block_DV=32, + threads=128, + num_stages=2, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/kda/chunk_bwd_dv.py b/examples/kda/chunk_bwd_dv.py new file mode 100644 index 000000000..cdbe0a899 --- /dev/null +++ b/examples/kda/chunk_bwd_dv.py @@ -0,0 +1,150 @@ +import tilelang +import tilelang.language as T +from tilelang.autotuner import autotune +import sys # noqa: F401 + +from FLA_KDA.fla_chunk_o import chunk_bwd_dv_local +from test_utils_kda import compare_tensors, do_bench + +import torch + +torch.random.manual_seed(1) + + +def prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + do_dtype, +): + q = torch.randn(B, S, H, DK, dtype=do_dtype).cuda() + k = torch.randn(B, S, H, DK, dtype=do_dtype).cuda() + DO = torch.randn(B, S, H, DV, dtype=do_dtype).cuda() + A = torch.randn(B, S, H, chunk_size, dtype=input_dtype).cuda() + return q, k, DO, A + + +def prepare_output( + B, + S, + H, + DV, + chunk_size, + output_dtype, +): + dv = torch.empty(B, S, H, DV, dtype=output_dtype).cuda() + return dv + + +def get_configs(): + import itertools + + block_DV = [32, 64, 128] + threads = [32, 64, 128] + num_stages = [0, 1, 2, 3, 4] + _configs = list(itertools.product(block_DV, threads, num_stages)) + configs = [{"block_DV": c[0], "threads": c[1], "num_stages": c[2]} for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=10, rep=5) +@tilelang.jit(out_idx=[-1], pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}) +def tilelang_chunk_bwd_kernel_dv_local( + B, + S, + H, + DV, + input_dtype, + output_dtype, + do_dtype, + chunk_size, + block_DV=128, + 
threads=128, + num_stages=1, +): + block_S = BS = chunk_size + DO_shape = (B, S, H, DV) + A_shape = (B, S, H, BS) + + @T.prim_func + def kernel( + DO: T.Tensor(DO_shape, dtype=do_dtype), + A: T.Tensor(A_shape, dtype=input_dtype), + dv: T.Tensor(DO_shape, dtype=output_dtype), + ): + with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): + bb, bh = bbh // H, bbh % H + + A_shared = T.alloc_shared((BS, BS), dtype=do_dtype) + DO_shared = T.alloc_shared((BS, block_DV), dtype=do_dtype) + dv_fragment = T.alloc_fragment((BS, block_DV), dtype=T.float32) + dv_shared = T.alloc_shared((BS, block_DV), dtype=output_dtype) + + T.copy(A[bb, bs * BS : (bs + 1) * BS, bh, :], A_shared) + for i_s1, i_s2 in T.Parallel(BS, BS): + A_shared[i_s1, i_s2] = T.if_then_else(i_s1 >= i_s2, A_shared[i_s1, i_s2], 0.0) + for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): + T.copy(DO[bb, bs * BS : (bs + 1) * BS, bh, i_v * block_DV : (i_v + 1) * block_DV], DO_shared) + T.gemm(A_shared, DO_shared, dv_fragment, transpose_A=True, clear_accum=True) # transpose_A: A^T + T.copy(dv_fragment, dv_shared) + T.copy(dv_shared, dv[bb, bs * BS : (bs + 1) * BS, bh, i_v * block_DV : (i_v + 1) * block_DV]) + + return kernel + + +def run_test( + B, + S, + H, + DK, + DV, + scale, + input_dtype, + do_dtype, + output_dtype, + chunk_size, +): + q, k, DO, A = prepare_input(B, S, H, DK, DV, chunk_size, getattr(torch, input_dtype), getattr(torch, do_dtype)) + dv_ref = chunk_bwd_dv_local(q, k, do=DO, A=A) + + dv_tilelang = prepare_output(B, S, H, DV, chunk_size, getattr(torch, output_dtype)) + kernel = tilelang_chunk_bwd_kernel_dv_local( + B=B, + S=S, + H=H, + DV=DV, + input_dtype=input_dtype, + output_dtype=output_dtype, + do_dtype=do_dtype, + chunk_size=chunk_size, + ) + dv_tilelang = kernel(DO, A) + compare_tensors("dv", dv_ref, dv_tilelang) + + fla_time = do_bench(chunk_bwd_dv_local, q, k, do=DO, A=A) + tilelang_time = do_bench(kernel, DO, A) + print("fla_time: ", fla_time) + print("tilelang_time: ", tilelang_time) + + +def main(): + run_test( + B=1, + S=1024 * 8, # 32768 + H=64, + DK=128, + DV=128, + scale=1.0, + input_dtype="bfloat16", + do_dtype="float32", + output_dtype="bfloat16", + chunk_size=64, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/kda/chunk_bwd_gla_dA.py b/examples/kda/chunk_bwd_gla_dA.py new file mode 100644 index 000000000..913fa9171 --- /dev/null +++ b/examples/kda/chunk_bwd_gla_dA.py @@ -0,0 +1,147 @@ +import tilelang +import tilelang.language as T +from tilelang.autotuner import autotune + +from FLA_KDA.fla_chunk_o import chunk_gla_bwd_dA +from test_utils_kda import compare_tensors, do_bench + +import torch + +torch.random.manual_seed(1) + + +def prepare_input( + B, + S, + H, + DV, + chunk_size, + input_dtype, + do_dtype, +): + DO = torch.randn(B, S, H, DV, dtype=do_dtype).cuda() + V_new = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + return DO, V_new + + +def prepare_output( + B, + S, + H, + DV, + chunk_size, + d_type, +): + dA = torch.empty(B, S, H, chunk_size, dtype=d_type).cuda() + return dA + + +def get_configs(): + import itertools + + block_DV = [32, 64, 128] + threads = [32, 64, 128, 256] + num_stages = [0, 1, 2, 3, 4] + _configs = list(itertools.product(block_DV, threads, num_stages)) + configs = [{"block_DV": c[0], "threads": c[1], "num_stages": c[2]} for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=10, rep=5) +@tilelang.jit(out_idx=[-1], pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}) +def 
tilelang_chunk_bwd_kernel_dv_local( + B, + S, + H, + DV, + scale, + input_dtype, + da_dtype, + do_dtype, + chunk_size, + block_DV=128, + threads=128, + num_stages=1, +): + block_S = BS = chunk_size + DO_shape = (B, S, H, DV) + V_shape = (B, S, H, DV) + dA_shape = (B, S, H, BS) + + @T.prim_func + def kernel( + DO: T.Tensor(DO_shape, dtype=do_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + dA: T.Tensor(dA_shape, dtype=da_dtype), + ): + with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): + bb, bh = bbh // H, bbh % H + do_shared = T.alloc_shared((block_S, block_DV), dtype=do_dtype) + V_shared = T.alloc_shared((block_S, block_DV), dtype=do_dtype) + dA_fragment = T.alloc_fragment((block_S, block_S), dtype=T.float32) + + T.clear(dA_fragment) + for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): + T.copy(DO[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], do_shared) + T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], V_shared) + T.gemm(do_shared, V_shared, dA_fragment, transpose_B=True) + for i_s1, i_s2 in T.Parallel(block_S, block_S): + dA_fragment[i_s1, i_s2] = T.if_then_else(i_s1 >= i_s2, dA_fragment[i_s1, i_s2] * scale, 0.0) # keep only the lower-triangular part + T.copy(dA_fragment, dA[bb, bs * block_S : (bs + 1) * block_S, bh, 0:block_S]) + + return kernel + + +def run_test( + B, + S, + H, + DK, + DV, + scale, + input_dtype, + do_dtype, + da_dtype, + chunk_size, +): + DO, V_new = prepare_input(B, S, H, DV, chunk_size, getattr(torch, input_dtype), getattr(torch, do_dtype)) + print(DO.dtype, V_new.dtype) + dA_ref = chunk_gla_bwd_dA(v=V_new, do=DO, scale=scale) + + dA_tilelang = prepare_output(B, S, H, DV, chunk_size, getattr(torch, da_dtype)) + kernel = tilelang_chunk_bwd_kernel_dv_local( + B=B, + S=S, + H=H, + DV=DV, + scale=scale, + input_dtype=input_dtype, + da_dtype=da_dtype, + do_dtype=do_dtype, + chunk_size=chunk_size, + ) + dA_tilelang = kernel(DO, V_new) + compare_tensors("dA", dA_ref, dA_tilelang) + fla_time = do_bench(chunk_gla_bwd_dA, v=V_new, do=DO, scale=scale) + tilelang_time = do_bench(kernel, DO, V_new) + print("fla_time:", fla_time) + print("tilelang_time:", tilelang_time) + + +def main(): + run_test( + B=1, + S=1024 * 8, # 8192 + H=64, + DK=128, + DV=128, + scale=1.0, + input_dtype="bfloat16", + do_dtype="bfloat16", + da_dtype="float32", + chunk_size=64, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/kda/chunk_bwd_intra.py b/examples/kda/chunk_bwd_intra.py new file mode 100644 index 000000000..6c66732b4 --- /dev/null +++ b/examples/kda/chunk_bwd_intra.py @@ -0,0 +1,493 @@ +# Reference: FLA_KDA/fla_chunk_intra.py +import tilelang +import tilelang.language as T +from tilelang.autotuner import autotune + +from FLA_KDA.fla_chunk_intra import chunk_kda_bwd_intra +from FLA_KDA.cumsum import chunk_local_cumsum +from test_utils_kda import compare_tensors, do_bench + +import torch + +torch.random.manual_seed(0) +torch.set_printoptions(profile="full") + + +def prepare_input( + B, + S, + H, + DK, + chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, +): + BT = chunk_size + q = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + k = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + g = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() + beta = torch.randn(B, S, H, dtype=input_dtype).cuda() + + # dAqk and dAkk are gradients w.r.t. 
Aqk and Akk + # Shape: (B, S, H, BT) + dAqk = torch.randn(B, S, H, BT, dtype=input_dtype).cuda() + dAkk = torch.randn(B, S, H, BT, dtype=input_dtype).cuda() + + # Initial gradients (will be updated by the kernel) + dq = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + dk = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + db = torch.randn(B, S, H, dtype=input_dtype).cuda() + dg = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() + + return q, k, g, beta, dAqk, dAkk, dq, dk, db, dg + + +def prepare_output( + B, + S, + H, + DK, + chunk_size, + NK, + output_dtype, + gate_dtype, + state_dtype, +): + dq = torch.empty(B, S, H, DK, dtype=output_dtype).cuda() + dk = torch.empty(B, S, H, DK, dtype=output_dtype).cuda() + db = torch.empty(NK, B, S, H, dtype=output_dtype).cuda() + dg = torch.empty(B, S, H, DK, dtype=gate_dtype).cuda() + return dq, dk, db, dg + + +def get_configs(): + import itertools + + threads = [32, 64, 128, 256] + num_stages = [0, 1, 2, 3] + _configs = list(itertools.product(threads, num_stages)) + + configs = [{"threads": c[0], "num_stages": c[1]} for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=5, rep=5) +@tilelang.jit( + out_idx=[-4, -3, -2, -1], + pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True}, +) +def tilelang_chunk_bwd_intra( + # task config + B, + S, + H, + DK, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + # kernel config + block_DK, + block_BC=16, + threads=128, + num_stages=0, +): + BT = chunk_size + BC = block_BC # sub-chunk size, typically 16 + + NC = BT // BC # number of sub-chunks + NT = T.ceildiv(S, BT) + NK = T.ceildiv(DK, block_DK) # number of K blocks + + K_shape = (B, S, H, DK) + Beta_shape = (B, S, H) + G_shape = (B, S, H, DK) + BT_shape = (B, S, H, BT) # for dAqk and dAkk + + dq_shape = (B, S, H, DK) + dk_shape = (B, S, H, DK) + db_shape = (B, S, H) + db2_shape = (NK, B, S, H) + dg_shape = (B, S, H, DK) + + @T.prim_func + def kernel( + # input + q: T.Tensor(K_shape, dtype=input_dtype), + k: T.Tensor(K_shape, dtype=input_dtype), + g: T.Tensor(G_shape, dtype=gate_dtype), + beta: T.Tensor(Beta_shape, dtype=input_dtype), + dAqk: T.Tensor(BT_shape, dtype=input_dtype), + dAkk: T.Tensor(BT_shape, dtype=input_dtype), + dq: T.Tensor(dq_shape, dtype=input_dtype), + dk: T.Tensor(dk_shape, dtype=input_dtype), + db: T.Tensor(db_shape, dtype=input_dtype), + dg: T.Tensor(dg_shape, dtype=gate_dtype), + # output + dq2: T.Tensor(dq_shape, dtype=output_dtype), + dk2: T.Tensor(dk_shape, dtype=output_dtype), + db2: T.Tensor(db2_shape, dtype=output_dtype), + dg2: T.Tensor(dg_shape, dtype=gate_dtype), + ): + with T.Kernel(T.ceildiv(DK, block_DK) * NC, NT, B * H, threads=threads) as (i_kc, i_t, i_bh): + i_k, i_i = i_kc // NC, i_kc % NC + bb, bh = i_bh // H, i_bh % H + + # actual sub-chunk index + i_ti = i_t * BT + i_i * BC + + # current sub-chunk data + q_shared = T.alloc_shared((BC, block_DK), dtype=input_dtype) + k_shared = T.alloc_shared((BC, block_DK), dtype=input_dtype) + beta_shared = T.alloc_shared((BC,), dtype=input_dtype) + g_current_shared = T.alloc_shared((BC, block_DK), dtype=gate_dtype) + gn_shared = T.alloc_shared((block_DK,), dtype=gate_dtype) # last token's g in current sub-chunk + + dq_shared = T.alloc_shared((BC, block_DK), dtype=input_dtype) + dk_shared = T.alloc_shared((BC, block_DK), dtype=input_dtype) + dg_shared = T.alloc_shared((BC, block_DK), dtype=gate_dtype) + + # Allocate fragments + dq2_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + 
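+ # Accumulators kept in accum_dtype: dq2/dk2 collect the intra-chunk gradient contributions;
+ # db is later reduced from dk2 * k and dg2 from q * dq2, before the incoming dq/dk/dg are merged in.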
dk2_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + dg2_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + db_fragment = T.alloc_fragment((BC,), dtype=accum_dtype) + + # Initialize fragments + T.clear(dq2_fragment) + T.clear(dk2_fragment) + T.clear(dg2_fragment) + T.clear(db_fragment) + + # Temporary shared memory for previous sub-chunks + k_prev_shared = T.alloc_shared((BC, block_DK), dtype=input_dtype) + g_prev_shared = T.alloc_shared((BC, block_DK), dtype=gate_dtype) + dAqk_prev_shared = T.alloc_shared((BC, BC), dtype=input_dtype) + dAkk_prev_shared = T.alloc_shared((BC, BC), dtype=input_dtype) + + # Temporary fragment for b_kg computation + kg_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + + kj_shared = T.alloc_shared((block_DK,), dtype=T.float32) + gkj_shared = T.alloc_shared((block_DK,), dtype=T.float32) + kgj_fragment = T.alloc_fragment((BC, block_DK), dtype=T.float32) + dAqk_col = T.alloc_shared((BC,), dtype=input_dtype) + dAkk_col = T.alloc_shared((BC,), dtype=input_dtype) + + # Load g, q, k for current sub-chunk + T.copy(q[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], q_shared) + T.copy(k[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], k_shared) + T.copy(g[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], g_current_shared) + T.copy(beta[bb, i_ti : i_ti + BC, bh], beta_shared) + + if i_i > 0: + chunk_first_idx = i_ti # chunk first token idx + + T.copy(g[bb, chunk_first_idx, bh, i_k * block_DK : (i_k + 1) * block_DK], gn_shared) # Get the first token's g value (b_gn) + + # Loop over previous sub-chunks (i_j from 0 to i_i-1) + # Since i_i is computed from i_kc % NC and NC is small, we can use conditional blocks + # Process each possible previous sub-chunk with conditional execution + for i_j in T.Pipelined(i_i, num_stages=num_stages): # i_j is index ofprevious sub_chunks + prev_ti = i_t * BT + i_j * BC + T.copy(k[bb, prev_ti : prev_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], k_prev_shared) + T.copy(g[bb, prev_ti : prev_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], g_prev_shared) + + T.copy(dAqk[bb, i_ti : i_ti + BC, bh, i_j * BC : (i_j + 1) * BC], dAqk_prev_shared) + T.copy(dAkk[bb, i_ti : i_ti + BC, bh, i_j * BC : (i_j + 1) * BC], dAkk_prev_shared) + + for i_bc, i_k2 in T.Parallel(BC, block_DK): + kg_fragment[i_bc, i_k2] = k_prev_shared[i_bc, i_k2] * T.exp2(gn_shared[i_k2] - g_prev_shared[i_bc, i_k2]) + + T.gemm(dAqk_prev_shared, kg_fragment, dq2_fragment, clear_accum=False) + T.gemm(dAkk_prev_shared, kg_fragment, dk2_fragment, clear_accum=False) + + for i_bc, i_k2 in T.Parallel(BC, block_DK): + gqn = T.exp2(g_current_shared[i_bc, i_k2] - gn_shared[i_k2]) + dq2_fragment[i_bc, i_k2] = dq2_fragment[i_bc, i_k2] * gqn + dk2_fragment[i_bc, i_k2] = dk2_fragment[i_bc, i_k2] * gqn + + # Process current sub-chunk diagonal + loop_length = T.min(BC, S - i_t * BT - i_i * BC) + for j in T.Pipelined(loop_length, num_stages=num_stages): + token_j_idx = i_ti + j + + T.copy(k[bb, token_j_idx, bh, i_k * block_DK : (i_k + 1) * block_DK], kj_shared) + T.copy(g[bb, token_j_idx, bh, i_k * block_DK : (i_k + 1) * block_DK], gkj_shared) + T.copy(dAqk[bb, i_ti : i_ti + BC, bh, i_i * BC + j], dAqk_col) + T.copy(dAkk[bb, i_ti : i_ti + BC, bh, i_i * BC + j], dAkk_col) + + for i_bc, i_k2 in T.Parallel(BC, block_DK): + kgj_fragment[i_bc, i_k2] = kj_shared[i_k2] * T.exp2(g_current_shared[i_bc, i_k2] - gkj_shared[i_k2]) + dq2_fragment[i_bc, i_k2] += T.if_then_else(i_bc >= j, dAqk_col[i_bc] * 
kgj_fragment[i_bc, i_k2], 0.0) + dk2_fragment[i_bc, i_k2] += T.if_then_else(i_bc >= j, dAkk_col[i_bc] * kgj_fragment[i_bc, i_k2], 0.0) + + # Compute b_db = sum(b_dk2 * b_k, dim=1) + dk2_k_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dk2_k_fragment[i_bc, i_k2] = dk2_fragment[i_bc, i_k2] * k_shared[i_bc, i_k2] + T.reduce_sum(dk2_k_fragment, db_fragment, dim=1, clear=True) + + # b_dk2 *= b_b[:, None] + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dk2_fragment[i_bc, i_k2] = dk2_fragment[i_bc, i_k2] * beta_shared[i_bc] + + # Compute b_dg2 = b_q * b_dq2 (before adding dq to dq2) + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dg2_fragment[i_bc, i_k2] = q_shared[i_bc, i_k2] * dq2_fragment[i_bc, i_k2] + + # Load dq and compute b_dq2 = b_dq2 + b_dq + T.copy(dq[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], dq_shared) + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dq2_fragment[i_bc, i_k2] = dq2_fragment[i_bc, i_k2] + dq_shared[i_bc, i_k2] + + # # Store results + T.copy(dq2_fragment, dq2[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK]) + T.copy(db_fragment, db2[i_k, bb, i_ti : i_ti + BC, bh]) + + # Initialize dkt_fragment for processing subsequent sub-chunks and lower triangular part + dkt_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + T.clear(dkt_fragment) + + # Temporary shared memory for subsequent sub-chunks + q_next_shared = T.alloc_shared((BC, block_DK), dtype=input_dtype) + k_next_shared = T.alloc_shared((BC, block_DK), dtype=input_dtype) + g_next_shared = T.alloc_shared((BC, block_DK), dtype=gate_dtype) + beta_next_shared = T.alloc_shared((BC,), dtype=input_dtype) + dAqk_next_shared = T.alloc_shared((BC, BC), dtype=input_dtype) + dAkk_next_shared = T.alloc_shared((BC, BC), dtype=input_dtype) + + # Temporary fragments for computation + gkn_shared = T.alloc_shared((BC, block_DK), dtype=accum_dtype) + qg_shared = T.alloc_shared((BC, block_DK), dtype=accum_dtype) + kbg_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + kbg_shared = T.alloc_shared((BC, block_DK), dtype=accum_dtype) + dkt_temp_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + # T.use_swizzle(10) + + NC_actual = T.min(NC, T.ceildiv(S - i_t * BT, BC)) # Process subsequent sub-chunks (i_j from i_i+1 to NC-1) + if i_i < NC_actual - 1: + # Get the last token's g value in current sub-chunk + chunk_last_idx = T.min(S, i_ti + BC) - 1 + gn_last_shared = T.alloc_shared((block_DK,), dtype=gate_dtype) + T.copy(g[bb, chunk_last_idx, bh, i_k * block_DK : (i_k + 1) * block_DK], gn_last_shared) + + # Loop over subsequent sub-chunks + for i_j in T.Pipelined(i_i + 1, NC_actual, num_stages=num_stages): + i_tj = i_t * BT + i_j * BC + + T.copy(q[bb, i_tj : i_tj + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], q_next_shared) + T.copy(k[bb, i_tj : i_tj + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], k_next_shared) + T.copy(g[bb, i_tj : i_tj + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], g_next_shared) + T.copy(beta[bb, i_tj : i_tj + BC, bh], beta_next_shared) + + T.copy(dAqk[bb, i_tj : i_tj + BC, bh, i_i * BC : (i_i + 1) * BC], dAqk_next_shared) # [BC, BC] need transpose + T.copy(dAkk[bb, i_tj : i_tj + BC, bh, i_i * BC : (i_i + 1) * BC], dAkk_next_shared) # [BC, BC] need transpose + + for i_bc, i_k2 in T.Parallel(BC, block_DK): + # kbg = k * beta + kbg_fragment[i_bc, i_k2] = k_next_shared[i_bc, i_k2] * beta_next_shared[i_bc] + gkn_shared[i_bc, i_k2] = T.if_then_else( + i_tj + i_bc < S, 
T.exp2(g_next_shared[i_bc, i_k2] - gn_last_shared[i_k2]), 0.0 + ) + + # Compute qg and kbg + for i_bc, i_k2 in T.Parallel(BC, block_DK): + qg_shared[i_bc, i_k2] = q_next_shared[i_bc, i_k2] * gkn_shared[i_bc, i_k2] + kbg_shared[i_bc, i_k2] = kbg_fragment[i_bc, i_k2] * gkn_shared[i_bc, i_k2] + + # Accumulate: dkt += dAqk^T @ qg + dAkk^T @ kbg + # Use transpose_A=True because dAqk/dAkk are loaded in (T, BT) layout but we need (BT, T) for gemm + T.gemm(dAqk_next_shared, qg_shared, dkt_temp_fragment, transpose_A=True, clear_accum=True) + T.gemm(dAkk_next_shared, kbg_shared, dkt_temp_fragment, transpose_A=True, clear_accum=False) + + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dkt_fragment[i_bc, i_k2] = dkt_fragment[i_bc, i_k2] + dkt_temp_fragment[i_bc, i_k2] + + # Scale dkt by exp2(gn_last - g_current) + for i_bc, i_k2 in T.Parallel(BC, block_DK): + g_scale = T.exp2(gn_last_shared[i_k2] - g_current_shared[i_bc, i_k2]) + dkt_fragment[i_bc, i_k2] = dkt_fragment[i_bc, i_k2] * g_scale + + # Process lower triangular part of current sub-chunk diagonal + # This corresponds to j <= i_bc in the diagonal block + qj_shared = T.alloc_shared((block_DK,), dtype=T.float32) + kj_shared_lower = T.alloc_shared((block_DK,), dtype=T.float32) + gj_shared_lower = T.alloc_shared((block_DK,), dtype=T.float32) + bj_local = T.alloc_local((1), dtype=input_dtype) + dAqk_col_lower = T.alloc_shared((BC,), dtype=input_dtype) + dAkk_col_lower = T.alloc_shared((BC,), dtype=input_dtype) + + gkq_fragment = T.alloc_fragment((BC, block_DK), dtype=T.float32) + # dkt_lower_temp = T.alloc_fragment((BC, block_DK), dtype=T.float32) + kbj_fragment = T.alloc_fragment((block_DK,), dtype=T.float32) + + max_token_j_idx = T.min(S, i_ti + BC) + for j in T.Pipelined(BC, num_stages=num_stages): + token_j_idx = i_ti + j + + if token_j_idx < max_token_j_idx: + T.copy(q[bb, token_j_idx, bh, i_k * block_DK : (i_k + 1) * block_DK], qj_shared) # [BK] + T.copy(k[bb, token_j_idx, bh, i_k * block_DK : (i_k + 1) * block_DK], kj_shared_lower) + T.copy(g[bb, token_j_idx, bh, i_k * block_DK : (i_k + 1) * block_DK], gj_shared_lower) + + bj_local[0] = beta[bb, token_j_idx, bh] + T.copy(dAqk[bb, token_j_idx, bh, i_i * BC : (i_i + 1) * BC], dAqk_col_lower) # [BC] + T.copy(dAkk[bb, token_j_idx, bh, i_i * BC : (i_i + 1) * BC], dAkk_col_lower) + + # Compute kbj = kj * bj + for i_k2 in T.Parallel(block_DK): + kbj_fragment[i_k2] = kj_shared_lower[i_k2] * bj_local[0] + # Compute gkq = exp2(gj - g_current) + for i_bc, i_k2 in T.Parallel(BC, block_DK): + gkq_fragment[i_bc, i_k2] = T.exp2(gj_shared_lower[i_k2] - g_current_shared[i_bc, i_k2]) + + # Accumulate: dkt += (dAkk * kbj + dAqk * qj) * gkq for i_bc <= j + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dkt_fragment[i_bc, i_k2] += T.if_then_else( + i_bc <= j, + (dAkk_col_lower[i_bc] * kbj_fragment[i_k2] + dAqk_col_lower[i_bc] * qj_shared[i_k2]) * gkq_fragment[i_bc, i_k2], + 0.0, + ) + + # Load dk and dg + T.copy(dk[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], dk_shared) + T.copy(dg[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], dg_shared) + + # Update dg2: dg2 += (dk2 - dkt) * k + dg + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dg2_fragment[i_bc, i_k2] = ( + dg2_fragment[i_bc, i_k2] + + (dk2_fragment[i_bc, i_k2] - dkt_fragment[i_bc, i_k2]) * k_shared[i_bc, i_k2] + + dg_shared[i_bc, i_k2] + ) + + # Update dk2: dk2 += dk + dkt + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dk2_fragment[i_bc, i_k2] += dk_shared[i_bc, i_k2] + dkt_fragment[i_bc, i_k2] + + # Store dk2 and dg2 
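+ # dg2 written here is still a partial result: run_test finishes it with a reverse chunk-local
+ # cumsum, and db2 (split over NK) is summed and added to db on the host side.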
+ T.copy(dk2_fragment, dk2[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK]) + T.copy(dg2_fragment, dg2[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK]) + + return kernel + + +def run_test( + B, + S, + H, + DK, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + threads=128, + num_stages=0, + cu_seqlens=None, + chunk_indices=None, +): + q, k, g, beta, dAqk, dAkk, dq, dk, db, dg = prepare_input( + B, + S, + H, + DK, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + + # Reference implementation + dq_ref, dk_ref, db_ref, dg_ref = chunk_kda_bwd_intra( + q=q, + k=k, + g=g, + beta=beta, + dAqk=dAqk, + dAkk=dAkk, + dq=dq, + dk=dk, + db=db, + dg=dg, + ) + block_DK = min(64, tilelang.math.next_power_of_2(DK)) + NK = (DK + block_DK - 1) // block_DK + # TileLang implementation + kernel = tilelang_chunk_bwd_intra( + B=B, + S=S, + H=H, + DK=DK, + input_dtype=input_dtype, + output_dtype=output_dtype, + accum_dtype=accum_dtype, + gate_dtype=gate_dtype, + state_dtype=state_dtype, + chunk_size=chunk_size, + block_DK=block_DK, + ) + + dq_tilelang, dk_tilelang, db_tilelang, dg_tilelang = prepare_output( + B, S, H, DK, chunk_size, NK, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) + dq_tilelang, dk_tilelang, db_tilelang, dg_tilelang = kernel(q, k, g, beta, dAqk, dAkk, dq, dk, db, dg) + db_tilelang = db_tilelang.sum(0).add_(db) + dg_tilelang = chunk_local_cumsum( + dg_tilelang, + chunk_size=chunk_size, + reverse=True, + ) + + compare_tensors("dq", dq_tilelang, dq_ref) + compare_tensors("dk", dk_tilelang, dk_ref) + compare_tensors("db", db_tilelang, db_ref) + compare_tensors("dg", dg_tilelang, dg_ref) + + fla_time = do_bench( + chunk_kda_bwd_intra, + q=q, + k=k, + g=g, + beta=beta, + dAqk=dAqk, + dAkk=dAkk, + dq=dq, + dk=dk, + db=db, + dg=dg, + ) + tilelang_time = do_bench(kernel, q, k, g, beta, dAqk, dAkk, dq, dk, db, dg) + print(f"Fla time: {fla_time}") + print(f"Tilelang time: {tilelang_time}") + + +def main(): + DK = 128 + run_test( + B=1, + S=8192, + H=8, + DK=DK, + input_dtype=T.float32, + output_dtype=T.float32, + accum_dtype=T.float32, + gate_dtype=T.float32, + state_dtype=T.float32, + chunk_size=64, + threads=128, + num_stages=0, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/kda/chunk_delta_bwd.py b/examples/kda/chunk_delta_bwd.py new file mode 100644 index 000000000..8c22488ca --- /dev/null +++ b/examples/kda/chunk_delta_bwd.py @@ -0,0 +1,309 @@ +# Reference: fla/ops/common/chunk_delta_h.py +import tilelang +import tilelang.language as T +from tilelang.autotuner import autotune + +from FLA_KDA.fla_chunk_delta import chunk_gated_delta_rule_bwd_dhu +from FLA_KDA.cumsum import chunk_local_cumsum +from test_utils_kda import do_bench, compare_tensors + +import torch +import torch.nn.functional as F + +torch.random.manual_seed(42) + + +def prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, +): + Q = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() * 0.01 + K = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + K = F.normalize(K, dim=-1, p=2) + W = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + # Note: G should be in logspace and do chunkwise cumsum + G = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() + G = F.logsigmoid(G) + G = chunk_local_cumsum(G, 
chunk_size) + + h0 = torch.randn(B, H, DK, DV, dtype=input_dtype).cuda() + dht = torch.randn(B, H, DK, DV, dtype=input_dtype).cuda() + dO = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() * 0.01 + + dv = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + return Q, K, W, G, h0, dht, dO, dv + + +def prepare_output( + B, + S, + H, + DK, + DV, + chunk_size, + output_dtype, + gate_dtype, + state_dtype, +): + BS = S // chunk_size + dh = torch.empty(B, BS, H, DK, DV, dtype=output_dtype).cuda() + dh0 = torch.empty(B, H, DK, DV, dtype=state_dtype).cuda() + dv2 = torch.empty(B, S, H, DV, dtype=output_dtype).cuda() + return dh, dh0, dv2 + + +def get_configs(): + import itertools + + block_DV = [32, 64, 128] + threads = [32, 64, 128, 256] + num_stages = [0, 1, 2, 3, 4] + _configs = list(itertools.product(block_DV, threads, num_stages)) + + configs = [{"block_DV": c[0], "threads": c[1], "num_stages": c[2]} for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=10, rep=10) +@tilelang.jit(out_idx=[-3, -2, -1]) +def tilelang_chunk_gated_delta_rule_bwd_dhu( + # task config + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + scale, + use_gk=True, + use_initial_state=True, + use_final_state_gradient=True, + # kernel config + block_DV=64, + threads=256, + num_stages=0, +): + block_S = chunk_size + # Should support cu_seqlen + BS = S // block_S + + Q_shape = (B, S, H, DK) + K_shape = (B, S, H, DK) + W_shape = (B, S, H, DK) + G_shape = (B, S, H, DK) + h0_shape = (B, H, DK, DV) + dht_shape = (B, H, DK, DV) + dO_shape = (B, S, H, DV) + dv_shape = (B, S, H, DV) + + dh_shape = (B, BS, H, DK, DV) + dh0_shape = (B, H, DK, DV) + dv2_shape = (B, S, H, DV) + + @T.prim_func + def kernel( + # Input + Q: T.Tensor(Q_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + GK: T.Tensor(G_shape, dtype=gate_dtype), + h0: T.Tensor(h0_shape, dtype=input_dtype), + dht: T.Tensor(dht_shape, dtype=input_dtype), + dO: T.Tensor(dO_shape, dtype=input_dtype), + dv: T.Tensor(dv_shape, dtype=input_dtype), + # Output + dh: T.Tensor(dh_shape, dtype=output_dtype), + dh0: T.Tensor(dh0_shape, dtype=state_dtype), + dv2: T.Tensor(dv2_shape, dtype=output_dtype), + ): + with T.Kernel(T.ceildiv(DV, block_DV), B * H, threads=threads) as (bv, bbh): + bb, bh = bbh // H, bbh % H + + b_dh_shared = T.alloc_shared((DK, block_DV), dtype=output_dtype) + b_dh_fragment = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) + b_dh_fragment_1 = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) + b_dh_fragment_2 = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) + dv_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + dv_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + dv_fragment_2 = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + dO_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + K_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) + + Q_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) + W_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) + + GK_last_shared = T.alloc_shared((DK,), dtype=gate_dtype) + + if use_final_state_gradient: + T.copy(dht[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV], b_dh_shared) + T.copy(b_dh_shared, b_dh_fragment) + else: + T.clear(b_dh_fragment) + + for i_s in T.Pipelined(T.ceildiv(S, block_S), num_stages=num_stages): + # The gradient should be stored in the reverse order + i_s_inv = 
T.ceildiv(S, block_S) - i_s - 1 # reverse indices + # Store the updated dh + T.copy(b_dh_fragment, b_dh_shared) + T.copy(b_dh_shared, dh[bb, i_s_inv, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) + + # Update dv + T.copy(K[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, 0:DK], K_shared) + T.gemm(K_shared, b_dh_shared, dv_fragment, clear_accum=True) + T.copy( + dv[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], dv_shared + ) # copy old dv + T.copy(dv_shared, dv_fragment_2) + for i_s2, i_v in T.Parallel(block_S, block_DV): + dv_fragment[i_s2, i_v] = dv_fragment[i_s2, i_v] + dv_fragment_2[i_s2, i_v] + # Store the updated dv + T.copy(dv_fragment, dv_shared) + T.copy(dv_shared, dv2[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV]) + + # Update dh + T.copy(Q[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, 0:DK], Q_shared) # [block_S, DK] + T.copy(W[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, 0:DK], W_shared) # [block_S, DK] + T.copy( + dO[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], dO_shared + ) # [block_S, block_DV] + + if use_gk: + last_idx = T.min((i_s_inv + 1) * block_S, S) - 1 # chunk last token gk + T.copy(GK[bb, last_idx, bh, :], GK_last_shared) + for i_k, i_v in T.Parallel(DK, block_DV): + b_dh_fragment[i_k, i_v] *= T.exp2(GK_last_shared[i_k]) + + T.gemm(Q_shared, dO_shared, b_dh_fragment_1, transpose_A=True, clear_accum=True) # [DK, block_DV] + + # dv_shared: [block_S, block_DV] + T.gemm(W_shared, dv_shared, b_dh_fragment_2, transpose_A=True, clear_accum=True) # [DK, block_DV] + for i_k, i_v in T.Parallel(DK, block_DV): + b_dh_fragment[i_k, i_v] += b_dh_fragment_1[i_k, i_v] * scale - b_dh_fragment_2[i_k, i_v] + + if use_initial_state: + T.copy(b_dh_fragment, dh0[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) + + return kernel + + +def run_test( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + scale, + use_gk=True, + use_initial_state=True, + use_final_state_gradient=True, + block_DV=64, + threads=256, + num_stages=0, + use_torch=False, +): + Q, K, W, G, h0, dht, dO, dv = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + + dh_tilelang, dh0_tilelang, dv2_tilelang = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) + + # fla ref + print("fla running...", flush=True) + if use_gk: + dh_ref, dh0_ref, dv2_ref = chunk_gated_delta_rule_bwd_dhu( + q=Q, k=K, w=W, do=dO, dv=dv, gk=G, h0=h0, dht=dht, scale=scale, use_exp2=True + ) + + # tilelang + print("tilelang running...", flush=True) + kernel = tilelang_chunk_gated_delta_rule_bwd_dhu( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + scale, + use_gk, + use_initial_state, + use_final_state_gradient, + ) + dh_tilelang, dh0_tilelang, dv2_tilelang = kernel(Q, K, W, G, h0, dht, dO, dv) + + fla_time = do_bench( + chunk_gated_delta_rule_bwd_dhu, q=Q, k=K, w=W, do=dO, dv=dv, gk=G, h0=h0, dht=dht, scale=scale, chunk_size=chunk_size + ) + tilelang_time = do_bench(kernel, Q, K, W, G, h0, dht, dO, dv) + + print(f"fla time: {fla_time} ms") + print(f"tilelang time: {tilelang_time} ms") + + compare_tensors("dh", dh_ref, 
dh_tilelang) + compare_tensors("dh0", dh0_ref, dh0_tilelang) + compare_tensors("dv2", dv2_ref, dv2_tilelang) + + +def main(): + DK = 128 + run_test( + B=1, + S=1024 * 8, + H=64, + DK=DK, + DV=128, + input_dtype="bfloat16", + output_dtype="bfloat16", + accum_dtype="float32", + gate_dtype="float32", + state_dtype="float32", + chunk_size=64, + scale=DK**-0.5, + use_gk=True, + use_initial_state=True, + use_final_state_gradient=True, + block_DV=32, + threads=128, + num_stages=1, + use_torch=False, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/kda/chunk_delta_h_fwd.py b/examples/kda/chunk_delta_h_fwd.py new file mode 100644 index 000000000..fbb8bd988 --- /dev/null +++ b/examples/kda/chunk_delta_h_fwd.py @@ -0,0 +1,306 @@ +# Reference: fla/ops/common/chunk_delta_h.py + +import sys # noqa: F401 +import tilelang +import tilelang.language as T +from tilelang.autotuner import autotune + +# Add your fla repository path to sys.path +# Currently we use the fla repository from the flash-linear-attention project at commit id f03cb3ae +# sys.path.insert(0, "/your/path/to/flash-linear-attention") + +from FLA_KDA.fla_chunk_delta import chunk_gated_delta_rule_fwd_h +from FLA_KDA.cumsum import chunk_local_cumsum + +import torch +import torch.nn.functional as F + +from test_utils_kda import compare_tensors, do_bench + +torch.random.manual_seed(42) + + +def prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, +): + K = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + K = F.normalize(K, dim=-1, p=2) + W = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + W = F.normalize(W, dim=-1, p=2) + U = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + U = F.normalize(U, dim=-1, p=2) + G = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() + G = F.logsigmoid(G) + G = chunk_local_cumsum(G, chunk_size) + initial_state = torch.randn(B, H, DK, DV, dtype=input_dtype).cuda() + return K, W, U, G, initial_state + + +def prepare_output( + B, + S, + H, + DK, + DV, + chunk_size, + output_dtype, + state_dtype, +): + BS = (S + chunk_size - 1) // chunk_size # ceildiv to match kernel iteration + h = torch.empty(B, BS, H, DK, DV, dtype=output_dtype).cuda() + final_state = torch.empty(B, H, DK, DV, dtype=state_dtype).cuda() + V_new = torch.empty(B, S, H, DV, dtype=output_dtype).cuda() + return h, final_state, V_new + + +def get_configs(): + import itertools + + block_DK = [32, 64, 128] + block_DV = [32, 64, 128] + threads = [128, 256] + num_stages = [1, 2, 3] + _configs = list(itertools.product(block_DK, block_DV, threads, num_stages)) + + configs = [{"block_DK": c[0], "block_DV": c[1], "threads": c[2], "num_stages": c[3]} for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=3, rep=5) +@tilelang.jit(out_idx=[-3, -2, -1], pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}) +def tilelang_chunk_gated_delta_rule_fwd_h( + # task config + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + use_gk, + use_initial_state, + store_final_state, + save_new_value, + # kernel config + block_DK=64, + block_DV=32, + threads=128, + num_stages=1, +): + block_S = chunk_size + BS = (S + chunk_size - 1) // chunk_size # ceildiv to match kernel iteration + + K_shape = (B, S, H, DK) + V_shape = (B, S, H, DV) + W_shape = (B, S, H, DK) + U_shape = (B, S, H, DV) + GK_shape = (B, S, H, DK) + h_shape = (B, BS, H, DK, DV) + initial_state_shape = (B, H, DK, DV) + 
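+ # h stores one (DK, DV) state snapshot per chunk (BS = ceildiv(S, chunk_size)); the kernel
+ # writes the state *before* each chunk is absorbed, then applies the gated delta-rule update.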
final_state_shape = (B, H, DK, DV) + + @T.prim_func + def kernel( + K: T.Tensor(K_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + U: T.Tensor(U_shape, dtype=input_dtype), + GK: T.Tensor(GK_shape, dtype=gate_dtype), + initial_state: T.Tensor(initial_state_shape, dtype=input_dtype), + h: T.Tensor(h_shape, dtype=output_dtype), + final_state: T.Tensor(final_state_shape, dtype=state_dtype), + V_new: T.Tensor(V_shape, dtype=output_dtype), + ): + with T.Kernel(T.ceildiv(DV, block_DV), B * H, threads=threads) as (bv, bbh): + bb, bh = bbh // H, bbh % H + + b_h_shared = T.alloc_shared((DK, block_DV), dtype=input_dtype) + b_h_fragment = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) + + U_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + U_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + W_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) + V_new_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + V_new_shared = T.alloc_shared((block_S, block_DV), dtype=output_dtype) + K_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) + GK_last_shared = T.alloc_shared((DK), dtype=gate_dtype) + + if use_initial_state: + T.copy(initial_state[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV], b_h_shared) + T.copy(b_h_shared, b_h_fragment) + else: + T.clear(b_h_fragment) + + for i_s in T.Pipelined(T.ceildiv(S, block_S), num_stages=num_stages): + # Store previous result to the hidden tensor, like the epilogue + T.copy(b_h_shared, h[bb, i_s, bh, :, bv * block_DV : (bv + 1) * block_DV]) + + # Recurrence + T.copy(W[bb, i_s * block_S : (i_s + 1) * block_S, bh, :], W_shared) + T.gemm(W_shared, b_h_shared, V_new_fragment, clear_accum=True) + + # U - W * S + T.copy(U[bb, i_s * block_S : (i_s + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], U_shared) + T.copy(U_shared, U_fragment) + for i_s2, i_v in T.Parallel(block_S, block_DV): + V_new_fragment[i_s2, i_v] = -V_new_fragment[i_s2, i_v] + U_fragment[i_s2, i_v] + + # Save V_new + if save_new_value: + T.copy(V_new_fragment, dst=V_new_shared) + T.copy(V_new_shared, V_new[bb, i_s * block_S : (i_s + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV]) + + T.copy(K[bb, i_s * block_S : (i_s + 1) * block_S, bh, 0:DK], K_shared) + # use_gk + if use_gk: + T.copy(GK[bb, (i_s + 1) * block_S - 1, bh, :], GK_last_shared) # block last token + for i_k, i_v in T.Parallel(DK, block_DV): + b_h_fragment[i_k, i_v] *= T.exp2(GK_last_shared[i_k]) + + # Update intermediate results + T.copy(V_new_fragment, V_new_shared) + T.gemm(K_shared, V_new_shared, b_h_fragment, transpose_A=True) + + T.copy(b_h_fragment, b_h_shared) + + # Save final state + if store_final_state: + T.copy(b_h_fragment, final_state[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) + + return kernel + + +def run_test( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + use_gk=True, + use_initial_state=True, + store_final_state=True, + save_new_value=True, + block_DK=64, + block_DV=32, + threads=128, + num_stages=0, +): + K, W, U, G, initial_state = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + ) + h_ref, final_state_ref, V_new_ref = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, state_dtype) + ) + h_tilelang, final_state_tilelang, V_new_tilelang = prepare_output( + B, S, H, DK, 
DV, chunk_size, getattr(torch, output_dtype), getattr(torch, state_dtype) + ) + + # fla ref + h_ref, V_new_ref, final_state_ref = chunk_gated_delta_rule_fwd_h( + k=K, + w=W, + u=U, + gk=G, + initial_state=initial_state, + output_final_state=store_final_state, + chunk_size=chunk_size, + save_new_value=save_new_value, + use_exp2=True, + ) + + # tilelang + kernel = tilelang_chunk_gated_delta_rule_fwd_h( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + use_gk, + use_initial_state, + store_final_state, + save_new_value, + ) + h_tilelang, final_state_tilelang, V_new_tilelang = kernel(K, W, U, G, initial_state) + + fla_time = do_bench( + chunk_gated_delta_rule_fwd_h, + k=K, + w=W, + u=U, + gk=G, + initial_state=initial_state, + output_final_state=store_final_state, + chunk_size=chunk_size, + save_new_value=save_new_value, + use_exp2=True, + ) + tilelang_time = do_bench(kernel, K, W, U, G, initial_state) + + # check correctness + compare_tensors("h", h_ref, h_tilelang) + compare_tensors("final_state", final_state_ref, final_state_tilelang) + compare_tensors("V_new", V_new_ref, V_new_tilelang) + + print(f"tilelang time: {tilelang_time} ms") + print(f"fla time: {fla_time} ms") + + +def main(): + run_test( + B=1, + S=8192, + H=64, + DK=128, + DV=128, + input_dtype="float16", + output_dtype="float16", + accum_dtype="float32", + gate_dtype="float32", + state_dtype="float32", + chunk_size=64, + use_gk=True, + use_initial_state=True, + store_final_state=True, + save_new_value=True, + block_DK=32, + block_DV=32, + threads=128, + num_stages=2, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/kda/chunk_inter_solve_fused.py b/examples/kda/chunk_inter_solve_fused.py new file mode 100644 index 000000000..940dc20c8 --- /dev/null +++ b/examples/kda/chunk_inter_solve_fused.py @@ -0,0 +1,566 @@ +import tilelang +import tilelang.language as T + +from FLA_KDA.fla_chunk_intra import chunk_kda_fwd_inter_solve_fused +from FLA_KDA.cumsum import chunk_local_cumsum +from test_utils_kda import compare_tensors, do_bench + +import torch +import torch.nn.functional as F + + +torch.random.manual_seed(42) + + +def prepare_input( + B, + S, + H, + DK, + chunk_size, + sub_chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, +): + q = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + k = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + beta = torch.randn(B, S, H, dtype=input_dtype).cuda() + gk = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() # 需要是cumsum + gk = F.logsigmoid(gk) + gk = chunk_local_cumsum(gk, chunk_size) + + Aqk = torch.empty(B, S, H, chunk_size, dtype=input_dtype).cuda() + Akk_diag = torch.ones(B, S, H, sub_chunk_size, dtype=torch.float32).cuda() + + return q, k, gk, beta, Aqk, Akk_diag + + +def prepare_output( + B, + S, + H, + chunk_size, + sub_chunk_size, + output_dtype, +): + Akk = torch.empty(B, S, H, chunk_size, dtype=output_dtype).cuda() + return Akk + + +@tilelang.jit(out_idx=[-2, -1], pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}) +def tilelang_chunk_kda_fwd_inter_fused( + B, + S, + H, + DK, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + chunk_size, + sub_chunk_size, + scale, + block_DK=32, + threads=32, + num_stages=1, +): + block_S = BS = chunk_size + BC = sub_chunk_size + Q_shape = (B, S, H, DK) + K_shape = (B, S, H, DK) + GK_shape = (B, S, H, DK) + Beta_shape = (B, S, H) + Aqk_shape = (B, S, H, BS) + Akk_diag_shape = (B, S, H, BC) + """ + Fused kernel: 
compute inter-subchunk Akk + solve_tril in one pass. + Prerequisite: token_parallel has already computed diagonal Akk blocks in Akk_diag. + + This kernel: + 1. Computes off-diagonal Aqk blocks -> writes to global + 2. Computes off-diagonal Akk blocks -> keeps in registers + 3. Loads diagonal Akk blocks from Akk_diag (fp32) + 4. Does forward substitution on diagonals + 5. Computes merged Akk_inv + 6. Writes Akk_inv to Akk + """ + + @T.prim_func + def kernel( + Q: T.Tensor(Q_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + GK: T.Tensor(GK_shape, dtype=gate_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + Akk_diag: T.Tensor(Akk_diag_shape, dtype=T.float32), + Aqk: T.Tensor(Aqk_shape, dtype=output_dtype), + Akk: T.Tensor(Aqk_shape, dtype=output_dtype), + ): + with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): + bb, bh = bbh // H, bbh % H + + Aqk10_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype) + Akk10_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype) + Aqk20_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype) + Akk20_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype) + Aqk21_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype) + Akk21_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype) + Aqk30_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype) + Akk30_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype) + Aqk31_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype) + Akk31_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype) + Aqk32_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype) + Akk32_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype) + Akk10_shared = T.alloc_shared((BC, BC), dtype=T.float32) + Akk20_shared = T.alloc_shared((BC, BC), dtype=T.float32) + Akk21_shared = T.alloc_shared((BC, BC), dtype=T.float32) + Akk30_shared = T.alloc_shared((BC, BC), dtype=T.float32) + Akk31_shared = T.alloc_shared((BC, BC), dtype=T.float32) + Akk32_shared = T.alloc_shared((BC, BC), dtype=T.float32) + + K0_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + GK0_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + Q1_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + K1_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + GK1_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + Q2_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + K2_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + GK2_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + Q3_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + K3_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + GK3_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + + Q_GK_scaled_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + K_GK_scaled_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + b_kt_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + + b_gn1_shared = T.alloc_shared((block_DK,), dtype=T.float32) + b_gn2_shared = T.alloc_shared((block_DK,), dtype=T.float32) + b_gn3_shared = T.alloc_shared((block_DK,), dtype=T.float32) + + b_gqn1_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + b_gqn2_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + b_gqn3_shared = T.alloc_shared((BC, block_DK), dtype=T.float32) + + beta_1_shared = T.alloc_shared((BC,), dtype=T.float32) + beta_2_shared = T.alloc_shared((BC,), dtype=T.float32) + beta_3_shared = T.alloc_shared((BC,), dtype=T.float32) + # Akk_inv + Ai_00_shared = T.alloc_shared((BC, BC), 
dtype=T.float32) + Ai_10_shared = T.alloc_shared((BC, BC), dtype=T.float32) + Ai_11_shared = T.alloc_shared((BC, BC), dtype=T.float32) + Ai_20_shared = T.alloc_shared((BC, BC), dtype=T.float32) + Ai_21_shared = T.alloc_shared((BC, BC), dtype=T.float32) + Ai_22_shared = T.alloc_shared((BC, BC), dtype=T.float32) + Ai_30_shared = T.alloc_shared((BC, BC), dtype=T.float32) + Ai_31_shared = T.alloc_shared((BC, BC), dtype=T.float32) + Ai_32_shared = T.alloc_shared((BC, BC), dtype=T.float32) + Ai_33_shared = T.alloc_shared((BC, BC), dtype=T.float32) + + T.clear(Aqk10_fragment) + T.clear(Akk10_fragment) + T.clear(Aqk20_fragment) + T.clear(Akk20_fragment) + T.clear(Aqk21_fragment) + T.clear(Akk21_fragment) + T.clear(Aqk30_fragment) + T.clear(Akk30_fragment) + T.clear(Aqk31_fragment) + T.clear(Akk31_fragment) + T.clear(Aqk32_fragment) + T.clear(Akk32_fragment) + + i_tc0 = bs * BS + i_tc1 = bs * BS + BC + i_tc2 = bs * BS + 2 * BC + i_tc3 = bs * BS + 3 * BC + + ################################################################################ + # 1. off-diagonal blocks + ################################################################################ + + for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): + T.copy(K[bb, bs * BS : bs * BS + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], K0_shared) + T.copy(GK[bb, bs * BS : bs * BS + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], GK0_shared) + if i_tc1 < S: + T.copy(Q[bb, i_tc1 : i_tc1 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], Q1_shared) + T.copy(K[bb, i_tc1 : i_tc1 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], K1_shared) + T.copy(GK[bb, i_tc1 : i_tc1 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], GK1_shared) + T.copy(GK[bb, i_tc1, bh, i_k * block_DK : (i_k + 1) * block_DK], b_gn1_shared) # subblock第一个token的GK + for i_c1, i_k1 in T.Parallel(BC, block_DK): + b_gqn1_shared[i_c1, i_k1] = T.if_then_else( + i_tc1 + i_c1 < S, T.exp2(GK1_shared[i_c1, i_k1] - b_gn1_shared[i_k1]), 0.0 + ) + Q_GK_scaled_shared[i_c1, i_k1] = Q1_shared[i_c1, i_k1] * b_gqn1_shared[i_c1, i_k1] + K_GK_scaled_shared[i_c1, i_k1] = K1_shared[i_c1, i_k1] * b_gqn1_shared[i_c1, i_k1] + b_kt_shared[i_c1, i_k1] = K0_shared[i_c1, i_k1] * T.exp2(b_gn1_shared[i_k1] - GK0_shared[i_c1, i_k1]) + T.gemm(Q_GK_scaled_shared, b_kt_shared, Aqk10_fragment, transpose_B=True) + T.gemm(K_GK_scaled_shared, b_kt_shared, Akk10_fragment, transpose_B=True) + if i_tc2 < S: + T.copy(Q[bb, i_tc2 : i_tc2 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], Q2_shared) + T.copy(K[bb, i_tc2 : i_tc2 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], K2_shared) + T.copy(GK[bb, i_tc2 : i_tc2 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], GK2_shared) + T.copy(GK[bb, i_tc2, bh, i_k * block_DK : (i_k + 1) * block_DK], b_gn2_shared) + for i_c2, i_k2 in T.Parallel(BC, block_DK): + b_gqn2_shared[i_c2, i_k2] = T.if_then_else( + i_tc2 + i_c2 < S, T.exp2(GK2_shared[i_c2, i_k2] - b_gn2_shared[i_k2]), 0.0 + ) + Q_GK_scaled_shared[i_c2, i_k2] = Q2_shared[i_c2, i_k2] * b_gqn2_shared[i_c2, i_k2] + K_GK_scaled_shared[i_c2, i_k2] = K2_shared[i_c2, i_k2] * b_gqn2_shared[i_c2, i_k2] + b_kt_shared[i_c2, i_k2] = K0_shared[i_c2, i_k2] * T.exp2(b_gn2_shared[i_k2] - GK0_shared[i_c2, i_k2]) + T.gemm(Q_GK_scaled_shared, b_kt_shared, Aqk20_fragment, transpose_B=True) + T.gemm(K_GK_scaled_shared, b_kt_shared, Akk20_fragment, transpose_B=True) + for i_c3, i_k3 in T.Parallel(BC, block_DK): + b_kt_shared[i_c3, i_k3] = K1_shared[i_c3, i_k3] * T.exp2(b_gn2_shared[i_k3] - GK1_shared[i_c3, i_k3]) + 
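+ # Block (2,1): reuse the gate-scaled q2/k2 tiles against k1 rescaled by exp2(gn2 - g1).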
T.gemm(Q_GK_scaled_shared, b_kt_shared, Aqk21_fragment, transpose_B=True) + T.gemm(K_GK_scaled_shared, b_kt_shared, Akk21_fragment, transpose_B=True) + if i_tc3 < S: + T.copy(Q[bb, i_tc3 : i_tc3 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], Q3_shared) + T.copy(K[bb, i_tc3 : i_tc3 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], K3_shared) + T.copy(GK[bb, i_tc3 : i_tc3 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], GK3_shared) + T.copy(GK[bb, i_tc3, bh, i_k * block_DK : (i_k + 1) * block_DK], b_gn3_shared) + for i_c4, i_k4 in T.Parallel(BC, block_DK): + b_gqn3_shared[i_c4, i_k4] = T.if_then_else( + i_tc3 + i_c4 < S, T.exp2(GK3_shared[i_c4, i_k4] - b_gn3_shared[i_k4]), 0.0 + ) + Q_GK_scaled_shared[i_c4, i_k4] = Q3_shared[i_c4, i_k4] * b_gqn3_shared[i_c4, i_k4] + K_GK_scaled_shared[i_c4, i_k4] = K3_shared[i_c4, i_k4] * b_gqn3_shared[i_c4, i_k4] + b_kt_shared[i_c4, i_k4] = K0_shared[i_c4, i_k4] * T.exp2(b_gn3_shared[i_k4] - GK0_shared[i_c4, i_k4]) + T.gemm(Q_GK_scaled_shared, b_kt_shared, Aqk30_fragment, transpose_B=True) + T.gemm(K_GK_scaled_shared, b_kt_shared, Akk30_fragment, transpose_B=True) + for i_c5, i_k5 in T.Parallel(BC, block_DK): + b_kt_shared[i_c5, i_k5] = K1_shared[i_c5, i_k5] * T.exp2(b_gn3_shared[i_k5] - GK1_shared[i_c5, i_k5]) + T.gemm(Q_GK_scaled_shared, b_kt_shared, Aqk31_fragment, transpose_B=True) + T.gemm(K_GK_scaled_shared, b_kt_shared, Akk31_fragment, transpose_B=True) + for i_c6, i_k6 in T.Parallel(BC, block_DK): + b_kt_shared[i_c6, i_k6] = K2_shared[i_c6, i_k6] * T.exp2(b_gn3_shared[i_k6] - GK2_shared[i_c6, i_k6]) + T.gemm(Q_GK_scaled_shared, b_kt_shared, Aqk32_fragment, transpose_B=True) + T.gemm(K_GK_scaled_shared, b_kt_shared, Akk32_fragment, transpose_B=True) + + ################################################################################ + # 2. 
save off-diagonal Aqk blocks and prepare Akk + ################################################################################ + + if i_tc1 < S: + T.copy(Beta[bb, i_tc1 : i_tc1 + BC, bh], beta_1_shared) + for i_c21, i_c22 in T.Parallel(BC, BC): + Aqk10_fragment[i_c21, i_c22] = Aqk10_fragment[i_c21, i_c22] * scale + Akk10_fragment[i_c21, i_c22] = Akk10_fragment[i_c21, i_c22] * beta_1_shared[i_c21] + T.copy(Aqk10_fragment, Aqk[bb, i_tc1 : i_tc1 + BC, bh, 0:BC]) + T.copy(Akk10_fragment, Akk10_shared) + if i_tc2 < S: + T.copy(Beta[bb, i_tc2 : i_tc2 + BC, bh], beta_2_shared) + for i_c23, i_c24 in T.Parallel(BC, BC): + Aqk20_fragment[i_c23, i_c24] = Aqk20_fragment[i_c23, i_c24] * scale + Aqk21_fragment[i_c23, i_c24] = Aqk21_fragment[i_c23, i_c24] * scale + Akk20_fragment[i_c23, i_c24] = Akk20_fragment[i_c23, i_c24] * beta_2_shared[i_c23] + Akk21_fragment[i_c23, i_c24] = Akk21_fragment[i_c23, i_c24] * beta_2_shared[i_c23] + T.copy(Aqk20_fragment, Aqk[bb, i_tc2 : i_tc2 + BC, bh, 0:BC]) + T.copy(Aqk21_fragment, Aqk[bb, i_tc2 : i_tc2 + BC, bh, BC : 2 * BC]) + T.copy(Akk20_fragment, Akk20_shared) + T.copy(Akk21_fragment, Akk21_shared) + if i_tc3 < S: + T.copy(Beta[bb, i_tc3 : i_tc3 + BC, bh], beta_3_shared) + for i_c25, i_c26 in T.Parallel(BC, BC): + Aqk30_fragment[i_c25, i_c26] = Aqk30_fragment[i_c25, i_c26] * scale + Aqk31_fragment[i_c25, i_c26] = Aqk31_fragment[i_c25, i_c26] * scale + Aqk32_fragment[i_c25, i_c26] = Aqk32_fragment[i_c25, i_c26] * scale + Akk30_fragment[i_c25, i_c26] = Akk30_fragment[i_c25, i_c26] * beta_3_shared[i_c25] + Akk31_fragment[i_c25, i_c26] = Akk31_fragment[i_c25, i_c26] * beta_3_shared[i_c25] + Akk32_fragment[i_c25, i_c26] = Akk32_fragment[i_c25, i_c26] * beta_3_shared[i_c25] + T.copy(Aqk30_fragment, Aqk[bb, i_tc3 : i_tc3 + BC, bh, 0:BC]) + T.copy(Aqk31_fragment, Aqk[bb, i_tc3 : i_tc3 + BC, bh, BC : 2 * BC]) + T.copy(Aqk32_fragment, Aqk[bb, i_tc3 : i_tc3 + BC, bh, 2 * BC : 3 * BC]) + T.copy(Akk30_fragment, Akk30_shared) + T.copy(Akk31_fragment, Akk31_shared) + T.copy(Akk32_fragment, Akk32_shared) + + ################################################################################ + # 3. load diagonal Akk blocks + ################################################################################ + + T.copy(Akk_diag[bb, i_tc0 : i_tc0 + BC, bh, :], Ai_00_shared) + T.copy(Akk_diag[bb, i_tc1 : i_tc1 + BC, bh, :], Ai_11_shared) + T.copy(Akk_diag[bb, i_tc2 : i_tc2 + BC, bh, :], Ai_22_shared) + T.copy(Akk_diag[bb, i_tc3 : i_tc3 + BC, bh, :], Ai_33_shared) + for i_c1, i_c2 in T.Parallel(BC, BC): + Ai_00_shared[i_c1, i_c2] = T.if_then_else(i_c1 > i_c2, -Ai_00_shared[i_c1, i_c2], 0) + Ai_11_shared[i_c1, i_c2] = T.if_then_else(i_c1 > i_c2, -Ai_11_shared[i_c1, i_c2], 0) + Ai_22_shared[i_c1, i_c2] = T.if_then_else(i_c1 > i_c2, -Ai_22_shared[i_c1, i_c2], 0) + Ai_33_shared[i_c1, i_c2] = T.if_then_else(i_c1 > i_c2, -Ai_33_shared[i_c1, i_c2], 0) + + ################################################################################ + # 4. forward substitution on diagonals + ################################################################################ + a_00_shared = T.alloc_shared((BC,), dtype=T.float32) + Aa_mul_shared = T.alloc_shared((BC, BC), dtype=T.float32) + reduce_shared = T.alloc_shared((BC,), dtype=T.float32) + for i_i in T.Pipelined(2, T.min(BC, S - i_tc0), num_stages=num_stages): + T.copy(Akk_diag[bb, i_tc0 + i_i, bh, :], a_00_shared) # load row + for i_c in T.Parallel(BC): + a_00_shared[i_c] = T.if_then_else(i_c < i_i, -a_00_shared[i_c], 0.0) # mask:i_c
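
For reference on step 4 above (the forward-substitution / solve_tril pass), here is a minimal dense PyTorch sketch of the operation the docstring describes: inverting a unit-lower-triangular block `I + A` row by row, where `A` stands for the strictly lower-triangular entries of one `BC x BC` diagonal `Akk` block. The helper name `invert_unit_lower_triangular` and the self-check are illustrative only; the TileLang kernel works per `BC x BC` block in shared memory and folds the negation of the strictly-lower entries into its loop.

```python
import torch


def invert_unit_lower_triangular(A: torch.Tensor) -> torch.Tensor:
    """Return (I + A)^-1 for strictly lower-triangular A via forward substitution.

    A: (..., BC, BC) with zeros on and above the diagonal.
    Row i of the inverse is e_i - A[i, :i] @ Ainv[:i, :], computed in row order.
    """
    BC = A.shape[-1]
    Ainv = torch.zeros_like(A)
    idx = torch.arange(BC, device=A.device)
    Ainv[..., idx, idx] = 1.0  # start from the identity
    for i in range(1, BC):
        # rows 0..i-1 are already final; combine them with row i of A
        Ainv[..., i, :] -= (A[..., i : i + 1, :i] @ Ainv[..., :i, :]).squeeze(-2)
    return Ainv


if __name__ == "__main__":
    BC = 16
    A = torch.randn(BC, BC, dtype=torch.float64).tril(-1)
    Ainv = invert_unit_lower_triangular(A)
    eye = torch.eye(BC, dtype=torch.float64)
    err = ((eye + A) @ Ainv - eye).abs().max()
    print(f"max reconstruction error: {err.item():.3e}")
```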